This is an R Markdown Notebook for analysis using data on the DC Bus System (WMATA Metrobus). The data were obtained here:
https://planitmetro.com/2016/11/16/data-download-metrobus-vehicle-location-data/
Control + Alt + Shift + m = rename in scope
Load the packages to be used.
Get the data.
First let’s check the working directory.
getwd()
[1] "/Users/mdturse/Desktop/Analytics/DCMetroBus"
Then, actually get the data.
The working directory was changed to /Users/mdturse/Desktop/Analytics/DCMetroBus/Bus AVL Oct 2016 inside a notebook chunk. The working directory will be reset when the chunk is finished running. Use the knitr root.dir option in the setup chunk to change the the working directory for notebook chunks.
Put the daily data together.
# Stack the five daily extracts into one data frame. `.id = "group"` adds a
# column recording which list element (by position) each row came from.
AllDays <- list(Oct03Raw, Oct04Raw, Oct05Raw, Oct06Raw, Oct07Raw) %>%
  bind_rows(.id = "group")
Unequal factor levels: coercing to characterUnequal factor levels: coercing to characterUnequal factor levels: coercing to characterUnequal factor levels: coercing to characterUnequal factor levels: coercing to characterUnequal factor levels: coercing to character
# dim(AllDays)
str(AllDays)
'data.frame': 3119443 obs. of 18 variables:
$ group : chr "1" "1" "1" "1" ...
$ Bus_ID : int 11 11 11 11 11 11 11 11 11 11 ...
$ Route : chr "S80" "S80" "S80" "S80" ...
$ RouteAlt : Factor w/ 14 levels "1","10","11",..: 1 1 1 1 1 1 1 1 6 6 ...
$ Route_Direction : chr "LOOP" "LOOP" "LOOP" "LOOP" ...
$ Stop_Sequence : int 7 7 6 3 2 8 1 1 2 3 ...
$ Stop_ID : chr "5004572" "5004572" "5004573" "5002210" ...
$ Stop_Desc : chr "BEULAH ST + CHARLES ARRINGTON DR" "BEULAH ST + CHARLES ARRINGTON DR" "WALKER LN + #6363" "WALKER LN + BEULAH ST" ...
$ Event_Type : int 4 5 4 4 4 3 3 4 4 4 ...
$ Event_Description: Factor w/ 3 levels "Serviced Stop ",..: 3 2 3 3 3 1 1 3 3 3 ...
$ Event_Time : chr "10-3-16 6:06:47 AM" "10-3-16 6:07:50 AM" "10-3-16 6:09:47 AM" "10-3-16 6:10:24 AM" ...
$ Departure_Time : chr "10-3-16 6:06:47 AM" "10-3-16 6:08:01 AM" "10-3-16 6:09:47 AM" "10-3-16 6:10:24 AM" ...
$ Dwell_Time : int 0 11 0 0 0 0 104 0 0 0 ...
$ Delta_Time : int -177 -27 24 165 25 73 719 0 74 76 ...
$ Odometer_Distance: int 43543 43543 45139 46418 50115 51074 51303 53836 55633 56163 ...
$ Latitude : num 38.8 38.8 38.8 38.8 38.8 ...
$ Longitude : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Heading : int 199 253 97 276 15 119 100 89 274 104 ...
Deleting old data frames.
# Remove the per-day raw data frames (Oct03Raw ... Oct07Raw) one at a time,
# reporting each name as it is handled.
for (i in 3:7) {
  rm(list = ls(pattern = sprintf("Oct0%dRaw", i)))
  message(sprintf("Deleting Oct0%dRaw", i))
}
Deleting Oct03Raw
Deleting Oct04Raw
Deleting Oct05Raw
Deleting Oct06Raw
Deleting Oct07Raw
Updating variable types.
Then, sorting the data and adding a RowNumber (to be used for identifying rows later in the analyses.)
# Store the source-file identifier as a factor rather than a character vector.
AllDays[["group"]] <- factor(AllDays[["group"]])
Inspecting the values of Stop_ID, and finding that it can take the values “” (blank) and “NULL”.
# Row count per Stop_ID, ordered by Stop_ID, opened in the data viewer.
View(
  AllDays_Sorted %>%
    group_by(Stop_ID) %>%
    summarise(Cnt = n()) %>%
    arrange(Stop_ID)
)
# Rows whose Stop_ID is missing, blank ("") or the literal string "NULL",
# ordered by stop description, opened in the data viewer.
View(
  AllDays_Sorted %>%
    filter(is.na(Stop_ID) | Stop_ID %in% c("", "NULL")) %>%
    arrange(Stop_Desc)
)
Creating a table of distinct Stop_Desc values when Stop_ID is “” (blank) or “NULL”.
# Build a lookup table that assigns a surrogate integer ID (StopID_New) to
# every stop description that occurs with an unusable Stop_ID, i.e. one that
# is NA, blank ("") or the literal string "NULL".
StopID_New <- AllDays_Sorted %>%
  filter(is.na(Stop_ID) | Stop_ID %in% c("", "NULL")) %>%
  select(Stop_ID, Stop_Desc) %>%
  arrange(Stop_ID, Stop_Desc) %>%
  # Keep one row per Stop_Desc. This table is joined back on Stop_Desc alone,
  # so a description that appears under both "" and "NULL" must not occur
  # twice: duplicate keys would duplicate rows in the joined data.
  distinct(Stop_Desc, .keep_all = TRUE) %>%
  # row_number() is safe on an empty table; 1:nrow(.) would give c(1, 0).
  mutate(StopID_New = row_number())

View(StopID_New)
Creating a full updated table by filling in StopID_New for when Stop_ID is “” (blank) or NULL.
# Attach the surrogate IDs to the event table (matched on Stop_Desc) and derive:
#   StopID_Clean     - the surrogate ID where one was matched, else Stop_ID
#   StopID_Indicator - "ID_Bad" where a surrogate ID was matched, else "ID_OK"
# NOTE(review): the match is on Stop_Desc only, so a row with a usable Stop_ID
# also receives the surrogate ID (and "ID_Bad") when its description equals
# that of a bad-ID row -- confirm this is intended.
AllDays_StopIDNew <- AllDays_Sorted %>%
  left_join(
    StopID_New %>% select(Stop_Desc, StopID_New),
    by = "Stop_Desc"
  ) %>%
  mutate(
    StopID_Clean = ifelse(is.na(StopID_New), Stop_ID, StopID_New),
    StopID_Indicator = factor(ifelse(is.na(StopID_New), "ID_OK", "ID_Bad"))
  )

# The inputs are no longer needed; free the memory they hold.
rm(StopID_New, AllDays_Sorted)

str(AllDays_StopIDNew)
'data.frame': 3119443 obs. of 22 variables:
$ group : Factor w/ 5 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
$ Bus_ID : int 11 11 11 11 11 11 11 11 11 11 ...
$ Route : chr "S80" "S80" "S80" "S80" ...
$ RouteAlt : Factor w/ 14 levels "1","10","11",..: 1 1 1 1 1 1 1 1 6 6 ...
$ Route_Direction : Factor w/ 12 levels "","ANTICLKW",..: 6 6 6 6 6 6 6 6 6 6 ...
$ Stop_Sequence : int 7 7 6 3 2 8 1 1 2 3 ...
$ Stop_ID : chr "5004572" "5004572" "5004573" "5002210" ...
$ Stop_Desc : chr "BEULAH ST + CHARLES ARRINGTON DR" "BEULAH ST + CHARLES ARRINGTON DR" "WALKER LN + #6363" "WALKER LN + BEULAH ST" ...
$ Event_Type : int 4 5 4 4 4 3 3 4 4 4 ...
$ Event_Description: Factor w/ 3 levels "Serviced Stop ",..: 3 2 3 3 3 1 1 3 3 3 ...
$ Event_Time : POSIXct, format: "2016-10-03 06:06:47" "2016-10-03 06:07:50" ...
$ Departure_Time : POSIXct, format: "2016-10-03 06:06:47" "2016-10-03 06:08:01" ...
$ Dwell_Time : int 0 11 0 0 0 0 104 0 0 0 ...
$ Delta_Time : int -177 -27 24 165 25 73 719 0 74 76 ...
$ Odometer_Distance: int 43543 43543 45139 46418 50115 51074 51303 53836 55633 56163 ...
$ Latitude : num 38.8 38.8 38.8 38.8 38.8 ...
$ Longitude : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Heading : int 199 253 97 276 15 119 100 89 274 104 ...
$ RowNum_OG : int 1 2 3 4 5 6 7 8 9 10 ...
$ StopID_New : int NA NA NA NA NA NA NA NA NA NA ...
$ StopID_Clean : chr "5004572" "5004572" "5004573" "5002210" ...
$ StopID_Indicator : Factor w/ 2 levels "ID_Bad","ID_OK": 2 2 2 2 2 2 2 2 2 2 ...
# View(tail(AllDays_StopIDNew, 500))
# View(filter(AllDays_StopIDNew,
# Stop_Desc == "METROWAY ANNNOUCEMNT CORR"
# )
# )
Feature engineering.
Inspecting incidences of consecutive Stop_IDs. This is done because investigation showed that many consecutive events occur at the same Stop_ID, but with various Dwell_Times, Odometer_Distances, etc., all of which affect calculations and analyses.
Create data on the runs (consecutive Stop_IDs).
# Run-length encode StopID_Clean over the rows of AllDays_StopIDNew in their
# current order, then record the last and first row index of every run.
# NOTE(review): runs are formed over the whole table without reference to
# Bus_ID, so a run could span two buses if one bus's last event and the next
# bus's first event share a stop -- verify against the sort order.
StopID_Runs <- rle(AllDays_StopIDNew$StopID_Clean)
StopID_Runs$ends <- cumsum(StopID_Runs$lengths)

# First row of a run = last row - run length + 1. This avoids lag(), which
# only shifts a vector when dplyr's version masks stats::lag, and keeps the
# indices as integers (also correct for zero runs).
StopID_Runs$starts <- StopID_Runs$ends - StopID_Runs$lengths + 1L

str(StopID_Runs)
List of 4
$ lengths: int [1:2809529] 2 1 1 1 1 2 1 1 1 1 ...
$ values : chr [1:2809529] "5004572" "5004573" "5002210" "5002209" ...
$ ends : int [1:2809529] 2 3 4 5 6 8 9 10 11 12 ...
$ starts : num [1:2809529] 1 3 4 5 6 7 9 10 11 12 ...
- attr(*, "class")= chr "rle"
# class(StopID_Runs)
#
# StopID_Runs_df <- data.frame(unclass(StopID_Runs))
# str(StopID_Runs_df)
# class(StopID_Runs_df)
# rm(StopID_Runs_df)
Trying to link data on RunsGroups with the original data (AllDays_Sorted). The goal is to select only one record per RunsGroup - that being the record with the longest Dwell_Time.
I attempted this computation using both data.frames (dplyr) and data.tables (data.table). However, with 2,809,062 rows in one dataset and 3,119,443 rows in the other dataset, the current computation time is over 5 days…so I’m trying a different strategy to only select the first record in a run.
# Create a RunsGroup variable for each run
# StopID_Runs_df$RunsGroup <- paste0("g", seq(1:nrow(StopID_Runs_df)
# )
# )
#
# str(StopID_Runs_df)
# head(StopID_Runs_df, 25)
# tail(StopID_Runs_df, 25)
#
# StopID_Runs_df <- StopID_Runs_df %>%
# mutate(RowNum = row_number()
# )
#
# str(StopID_Runs_df)
# head(StopID_Runs_df, 25)
# tail(StopID_Runs_df, 25)
#
#
# # Converting to data.tables for, hopefully, improved performance (speed) in computation
# StopID_Runs_dt <- data.table(StopID_Runs_df)
# setkey(StopID_Runs_dt, RowNum)
# str(StopID_Runs_dt)
#
# AllDays_Sorted_dt <- data.table(AllDays_Sorted)
# setkey(AllDays_Sorted_dt, RowNum_OG)
# str(AllDays_Sorted_dt)
# # rm(AllDays_Sorted_dt)
#
#
# # Actual loop to perform the computations and link to original data (AllDays_Sorted_dt)
# GroupData <- list()
# for(i in 1:nrow(StopID_Runs_dt)
# ) {
# assign(paste0("group_", i),
# StopID_Runs_dt[RowNum == i, RunsGroup]
# )
#
# ##### The code below is the same code as above, but done with dplyr #####
#
# # assign(paste0("group_", i),
# # filter(StopID_Runs_df,
# # RowNum == i
# # ) %>%
# # select(RunsGroup)
# # )
#
# assign(paste0("group_", i, "_start"),
# StopID_Runs_dt[RowNum == i, starts]
# )
#
# assign(paste0("group_", i, "_end"),
# StopID_Runs_dt[RowNum == i, ends]
# )
#
# assign(paste0("group_", i, "_rows"),
# AllDays_Sorted_dt[RowNum_OG >= as.numeric(get(paste0("group_", i, "_start")
# )
# ) &
# RowNum_OG <= as.numeric(get(paste0("group_", i, "_end")
# )
# ),
# RunsGroup := as.character(get(paste0("group_", i)
# )
# )
# ]
#
# ##### The code below is the same as the code above, but done with dplyr #####
#
# # filter(AllDays_Sorted,
# # between(RowNum_OG,
# # as.numeric(get(paste0("group_", i, "_start")
# # )
# # ),
# # as.numeric(get(paste0("group_", i, "_end")
# # )
# # )
# # )
# # ) %>%
# # mutate(RunsGroup = as.character(get(paste0("group_", i)
# # )
# # )
# # )
# )
#
# GroupData[[i]] <- get(paste0("group_", i, "_rows"))
#
# message("Processing Group ", i, " of 2,809,062")
# }
#
#
# GroupData_df <- rbind.fill(GroupData)
# str(GroupData_df)
# head(GroupData_df)
# tail(GroupData_df)
# # rm(GroupData_df)
#
#
# group_1
# group_1_start
# group_1_end
# group_1_rows
# group_2_rows
# group_3_rows
# group_50_rows
# str(group_50_rows)
# group_2809062_rows
# GroupData[[1]]
# GroupData[[50]]
#
#
# ##### Testing Area (Below) #####
# ##### Testing Area (Below) #####
# ##### Testing Area (Below) #####
#
# # head(StopID_Runs$starts, 20)
# # head(AllDays_NewOrder$Stop_ID, 20)
# #
# #
# # dat <- as.data.frame(c(1,1,7,7,7,9,6,8,2,2,2,1,1,1,1,1))
# # colnames(dat)[1] <- "dat"
# # r <- rle(dat$dat)
# # dat$run <- rep(r$lengths, r$lengths)
# # dat$runLag <- lag(dat$run)
# # dat$cond <- rep(r$values, r$lengths)
# # dat
# # View(dat)
When the same Stop_ID occurs consecutively, only take the first occurrence. This is done because the computation time to select only the record with the longest Dwell_Time for each run was too long (over 5 days).
This is probably less than ideal with regard to Dwell_Time, but should not make much difference for calculations of travel time, speed, etc.
# Keep only the first row of each run of consecutive identical StopID_Clean
# values, using the run start indices computed earlier.
AllDays_FirstStopID <- AllDays_StopIDNew[StopID_Runs[["starts"]], , drop = FALSE]

dim(AllDays_StopIDNew)
[1] 3119443 22
dim(AllDays_FirstStopID)
[1] 2809529 22
nrow(AllDays_StopIDNew) - nrow(AllDays_FirstStopID)
[1] 309914
rm(AllDays_StopIDNew)
rm(StopID_Runs)
str(AllDays_FirstStopID)
'data.frame': 2809529 obs. of 22 variables:
$ group : Factor w/ 5 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
$ Bus_ID : int 11 11 11 11 11 11 11 11 11 11 ...
$ Route : chr "S80" "S80" "S80" "S80" ...
$ RouteAlt : Factor w/ 14 levels "1","10","11",..: 1 1 1 1 1 1 6 6 6 6 ...
$ Route_Direction : Factor w/ 12 levels "","ANTICLKW",..: 6 6 6 6 6 6 6 6 6 6 ...
$ Stop_Sequence : int 7 6 3 2 8 1 2 3 4 2 ...
$ Stop_ID : chr "5004572" "5004573" "5002210" "5002209" ...
$ Stop_Desc : chr "BEULAH ST + CHARLES ARRINGTON DR" "WALKER LN + #6363" "WALKER LN + BEULAH ST" "BEULAH ST + CHARLES ARRINGTON DR" ...
$ Event_Type : int 4 4 4 4 3 3 4 4 4 4 ...
$ Event_Description: Factor w/ 3 levels "Serviced Stop ",..: 3 3 3 3 1 1 3 3 3 3 ...
$ Event_Time : POSIXct, format: "2016-10-03 06:06:47" "2016-10-03 06:09:47" ...
$ Departure_Time : POSIXct, format: "2016-10-03 06:06:47" "2016-10-03 06:09:47" ...
$ Dwell_Time : int 0 0 0 0 0 104 0 0 0 0 ...
$ Delta_Time : int -177 24 165 25 73 719 74 76 63 69 ...
$ Odometer_Distance: int 43543 45139 46418 50115 51074 51303 55633 56163 56285 57262 ...
$ Latitude : num 38.8 38.8 38.8 38.8 38.8 ...
$ Longitude : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Heading : int 199 97 276 15 119 100 274 104 241 274 ...
$ RowNum_OG : int 1 3 4 5 6 7 9 10 11 12 ...
$ StopID_New : int NA NA NA NA NA NA NA NA NA NA ...
$ StopID_Clean : chr "5004572" "5004573" "5002210" "5002209" ...
$ StopID_Indicator : Factor w/ 2 levels "ID_Bad","ID_OK": 2 2 2 2 2 2 2 2 2 2 ...
Feature engineering.
Creating new variables.
# Add unit-converted measures and calendar / time-of-day features derived from
# Event_Time to the de-duplicated event table.
AllDays_AddVars <- mutate(
  AllDays_FirstStopID,
  Odometer_Distance_Mi = Odometer_Distance / 5280, # 5,280 feet in 1 mile

  # Units are stated explicitly: `Departure_Time - Event_Time` lets difftime
  # choose secs / mins / hours / days from the data, so the bare number would
  # not be guaranteed to be in seconds.
  Dwell_Time2 = as.numeric(difftime(Departure_Time, Event_Time, units = "secs")),

  Event_Time_Yr = as.integer(year(Event_Time)),
  Event_Time_Mth = as.integer(month(Event_Time)),
  Event_Time_Date = day(Event_Time),
  Event_Time_Day = wday(Event_Time, label = TRUE),
  Event_Time_Hr = hour(Event_Time),
  Event_Time_Min = minute(Event_Time),

  # Eight ordered 3-hour bins: [0,3), [3,6), ..., [21,24).
  Event_Time_HrGroup = cut(
    Event_Time_Hr,
    breaks = seq(0, 24, by = 3),
    labels = c(
      "Group0_2",
      "Group3_5",
      "Group6_8",
      "Group9_11",
      "Group12_14",
      "Group15_17",
      "Group18_20",
      "Group21_23"
    ),
    right = FALSE,
    ordered_result = TRUE
  )
)

rm(AllDays_FirstStopID)

str(AllDays_AddVars)
'data.frame': 2809529 obs. of 31 variables:
$ group : Factor w/ 5 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
$ Bus_ID : int 11 11 11 11 11 11 11 11 11 11 ...
$ Route : chr "S80" "S80" "S80" "S80" ...
$ RouteAlt : Factor w/ 14 levels "1","10","11",..: 1 1 1 1 1 1 6 6 6 6 ...
$ Route_Direction : Factor w/ 12 levels "","ANTICLKW",..: 6 6 6 6 6 6 6 6 6 6 ...
$ Stop_Sequence : int 7 6 3 2 8 1 2 3 4 2 ...
$ Stop_ID : chr "5004572" "5004573" "5002210" "5002209" ...
$ Stop_Desc : chr "BEULAH ST + CHARLES ARRINGTON DR" "WALKER LN + #6363" "WALKER LN + BEULAH ST" "BEULAH ST + CHARLES ARRINGTON DR" ...
$ Event_Type : int 4 4 4 4 3 3 4 4 4 4 ...
$ Event_Description : Factor w/ 3 levels "Serviced Stop ",..: 3 3 3 3 1 1 3 3 3 3 ...
$ Event_Time : POSIXct, format: "2016-10-03 06:06:47" "2016-10-03 06:09:47" ...
$ Departure_Time : POSIXct, format: "2016-10-03 06:06:47" "2016-10-03 06:09:47" ...
$ Dwell_Time : int 0 0 0 0 0 104 0 0 0 0 ...
$ Delta_Time : int -177 24 165 25 73 719 74 76 63 69 ...
$ Odometer_Distance : int 43543 45139 46418 50115 51074 51303 55633 56163 56285 57262 ...
$ Latitude : num 38.8 38.8 38.8 38.8 38.8 ...
$ Longitude : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Heading : int 199 97 276 15 119 100 274 104 241 274 ...
$ RowNum_OG : int 1 3 4 5 6 7 9 10 11 12 ...
$ StopID_New : int NA NA NA NA NA NA NA NA NA NA ...
$ StopID_Clean : chr "5004572" "5004573" "5002210" "5002209" ...
$ StopID_Indicator : Factor w/ 2 levels "ID_Bad","ID_OK": 2 2 2 2 2 2 2 2 2 2 ...
$ Odometer_Distance_Mi: num 8.25 8.55 8.79 9.49 9.67 ...
$ Dwell_Time2 : num 0 0 0 0 0 104 0 0 0 0 ...
$ Event_Time_Yr : int 2016 2016 2016 2016 2016 2016 2016 2016 2016 2016 ...
$ Event_Time_Mth : int 10 10 10 10 10 10 10 10 10 10 ...
$ Event_Time_Date : int 3 3 3 3 3 3 3 3 3 3 ...
$ Event_Time_Day : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tues"<..: 2 2 2 2 2 2 2 2 2 2 ...
$ Event_Time_Hr : int 6 6 6 6 6 6 6 6 6 6 ...
$ Event_Time_Min : int 6 9 10 10 13 14 21 21 23 23 ...
$ Event_Time_HrGroup : Ord.factor w/ 8 levels "Group0_2"<"Group3_5"<..: 3 3 3 3 3 3 3 3 3 3 ...
# group_by(AllDays_AddVars,
# Event_Time_HrGroup
# ) %>%
# summarise(Cnts = n()
# )
# View(head(filter(AllDays_AddVars,
# Event_Time_Hr == 0
# ),
# 50
# )
# )
# View(head(AllDays_AddVars, 50))
Feature engineering.
Creating more variables. Creating a BusEvent row number for future identification purposes. Then, creating various variables to analyze distance traveled and speed.
# Per bus and day of month: number the events, pull the previous event's
# odometer reading, position, departure time and stop, and derive the
# distance, time and average speed of the hop from the previous event.
# All lag() calls rely on the row order within each Bus_ID / Event_Time_Date
# group; the first event of each group therefore gets NA for lagged values.
AllDays_BusDay <- group_by(
  AllDays_AddVars,
  Bus_ID,
  Event_Time_Date
) %>%
  mutate(
    BusDay_EventNum = row_number(), # identifies a bus's movements on a given date
    Odometer_Distance_Lag1 = lag(Odometer_Distance),
    Latitude_L1 = lag(Latitude),
    Longitude_L1 = lag(Longitude),

    # Only a strictly increasing odometer reading yields a distance; an equal
    # or decreasing reading gives NA.
    TravelDistance_Ft = ifelse(
      Odometer_Distance > Odometer_Distance_Lag1,
      Odometer_Distance - Odometer_Distance_Lag1,
      NA
    ),
    TravelDistance_Mi = TravelDistance_Ft / 5280, # 5,280 feet in 1 mile

    # Great-circle distance between the previous and current position.
    TravelDistance_Mi_Hvrs = distHaversine(
      cbind(Longitude_L1, Latitude_L1),
      cbind(Longitude, Latitude)
    ) * 0.000621371, # 0.000621371 miles = 1 meter

    # Time from the previous departure to this event; NA unless positive.
    # difftime() is called with explicit units because this mutate runs once
    # per group: `Event_Time - lag(Departure_Time)` would let each group pick
    # its own unit (minutes or hours when every gap in the group is >= 60 s),
    # silently mixing units in a column named *_Sec.
    TravelTime_Sec = as.numeric(
      ifelse(
        Event_Time > lag(Departure_Time),
        difftime(Event_Time, lag(Departure_Time), units = "secs"),
        NA
      )
    ),
    TravelTime_Hr = TravelTime_Sec / 3600, # 3,600 seconds in 1 hour

    # Guard against zero or negative travel times.
    SpeedAvg_Mph = ifelse(
      TravelTime_Hr > 0,
      TravelDistance_Mi / TravelTime_Hr,
      NA
    ),

    Start_ID = lag(StopID_Clean),
    Start_Desc = lag(Stop_Desc),

    # "<previous stop>--<this stop>"; "NULL" stands in when there is no
    # previous stop.
    StartStop_ID = ifelse(
      is.na(Start_ID),
      paste("NULL", StopID_Clean, sep = "--"),
      paste(Start_ID, StopID_Clean, sep = "--")
    )
  ) %>%
  as.data.frame()
Inspecting for issues with StartStop_ID (where the value is either NA or contains NULL). They ONLY exist when BusDay_EventNum = 1 (which is by design). So everything looks OK.
# Row count per StartStop_ID, most frequent first, opened in the data viewer.
View(
  AllDays_BusDay %>%
    group_by(StartStop_ID) %>%
    summarise(Cnt = n()) %>%
    arrange(desc(Cnt))
)
# Sanity check: rows whose StartStop_ID is NA or contains "NULL" even though
# they are not the first event of their bus/day group.
View(
  AllDays_BusDay %>%
    filter(
      is.na(StartStop_ID) | str_detect(StartStop_ID, "NULL"),
      BusDay_EventNum != 1
    )
)
Stats for StartStop_ID.
# For every StartStop_ID, attach the 5th and 95th percentiles (NAs ignored) of
# travel distance in miles and travel time in seconds and hours to each row,
# then convert the result to a data.table.
Quantiles_SS_dt <- AllDays_BusDay %>%
  group_by(StartStop_ID) %>%
  mutate(
    TD_Mi_SS_q5   = quantile(TravelDistance_Mi, probs = 0.05, na.rm = TRUE),
    TD_Mi_SS_q95  = quantile(TravelDistance_Mi, probs = 0.95, na.rm = TRUE),
    TT_Sec_SS_q5  = quantile(TravelTime_Sec, probs = 0.05, na.rm = TRUE),
    TT_Sec_SS_q95 = quantile(TravelTime_Sec, probs = 0.95, na.rm = TRUE),
    TT_Hr_SS_q5   = quantile(TravelTime_Hr, probs = 0.05, na.rm = TRUE),
    TT_Hr_SS_q95  = quantile(TravelTime_Hr, probs = 0.95, na.rm = TRUE)
  ) %>%
  data.table()
Stats for StartStop_ID with Event_Time_HrGroup.
# Same percentiles as the per-StartStop_ID version, but computed within each
# StartStop_ID x Event_Time_HrGroup combination; returned as a data.table.
Quantiles_SSHG_dt <- Stats_StSt %>%
  group_by(StartStop_ID, Event_Time_HrGroup) %>%
  mutate(
    TD_Mi_SSHG_q5   = quantile(TravelDistance_Mi, probs = 0.05, na.rm = TRUE),
    TD_Mi_SSHG_q95  = quantile(TravelDistance_Mi, probs = 0.95, na.rm = TRUE),
    TT_Sec_SSHG_q5  = quantile(TravelTime_Sec, probs = 0.05, na.rm = TRUE),
    TT_Sec_SSHG_q95 = quantile(TravelTime_Sec, probs = 0.95, na.rm = TRUE),
    TT_Hr_SSHG_q5   = quantile(TravelTime_Hr, probs = 0.05, na.rm = TRUE),
    TT_Hr_SSHG_q95  = quantile(TravelTime_Hr, probs = 0.95, na.rm = TRUE)
  ) %>%
  data.table()
Feature engineering.
Creating a BusEventRoute row number, and a RouteAlt_Lag1 indicator for future identification purposes.
# Within each bus / day of month / route, record the previous row's RouteAlt.
# It is used later to flag changes of RouteAlt; the first row of every group
# has no predecessor and gets NA.
AllDays_BusDayRoute <- Stats_StSt_HrGrp %>%
  group_by(Bus_ID, Event_Time_Date, Route) %>%
  mutate(RouteAlt_Lag1 = lag(RouteAlt)) %>%
  data.frame()

str(AllDays_BusDayRoute)
'data.frame': 2809529 obs. of 93 variables:
$ group : Factor w/ 5 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
$ Bus_ID : int 11 11 11 11 11 11 11 11 11 11 ...
$ Route : chr "S80" "S80" "S80" "S80" ...
$ RouteAlt : Factor w/ 14 levels "1","10","11",..: 1 1 1 1 1 1 6 6 6 6 ...
$ Route_Direction : Factor w/ 12 levels "","ANTICLKW",..: 6 6 6 6 6 6 6 6 6 6 ...
$ Stop_Sequence : int 7 6 3 2 8 1 2 3 4 2 ...
$ Stop_ID : chr "5004572" "5004573" "5002210" "5002209" ...
$ Stop_Desc : chr "BEULAH ST + CHARLES ARRINGTON DR" "WALKER LN + #6363" "WALKER LN + BEULAH ST" "BEULAH ST + CHARLES ARRINGTON DR" ...
$ Event_Type : int 4 4 4 4 3 3 4 4 4 4 ...
$ Event_Description : Factor w/ 3 levels "Serviced Stop ",..: 3 3 3 3 1 1 3 3 3 3 ...
$ Event_Time : POSIXct, format: "2016-10-03 06:06:47" "2016-10-03 06:09:47" ...
$ Departure_Time : POSIXct, format: "2016-10-03 06:06:47" "2016-10-03 06:09:47" ...
$ Dwell_Time : int 0 0 0 0 0 104 0 0 0 0 ...
$ Delta_Time : int -177 24 165 25 73 719 74 76 63 69 ...
$ Odometer_Distance : int 43543 45139 46418 50115 51074 51303 55633 56163 56285 57262 ...
$ Latitude : num 38.8 38.8 38.8 38.8 38.8 ...
$ Longitude : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Heading : int 199 97 276 15 119 100 274 104 241 274 ...
$ RowNum_OG : int 1 3 4 5 6 7 9 10 11 12 ...
$ StopID_New : int NA NA NA NA NA NA NA NA NA NA ...
$ StopID_Clean : chr "5004572" "5004573" "5002210" "5002209" ...
$ StopID_Indicator : Factor w/ 2 levels "ID_Bad","ID_OK": 2 2 2 2 2 2 2 2 2 2 ...
$ Odometer_Distance_Mi : num 8.25 8.55 8.79 9.49 9.67 ...
$ Dwell_Time2 : num 0 0 0 0 0 104 0 0 0 0 ...
$ Event_Time_Yr : int 2016 2016 2016 2016 2016 2016 2016 2016 2016 2016 ...
$ Event_Time_Mth : int 10 10 10 10 10 10 10 10 10 10 ...
$ Event_Time_Date : int 3 3 3 3 3 3 3 3 3 3 ...
$ Event_Time_Day : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tues"<..: 2 2 2 2 2 2 2 2 2 2 ...
$ Event_Time_Hr : int 6 6 6 6 6 6 6 6 6 6 ...
$ Event_Time_Min : int 6 9 10 10 13 14 21 21 23 23 ...
$ Event_Time_HrGroup : Ord.factor w/ 8 levels "Group0_2"<"Group3_5"<..: 3 3 3 3 3 3 3 3 3 3 ...
$ BusDay_EventNum : int 1 2 3 4 5 6 7 8 9 10 ...
$ Odometer_Distance_Lag1: int NA 43543 45139 46418 50115 51074 51303 55633 56163 56285 ...
$ Latitude_L1 : num NA 38.8 38.8 38.8 38.8 ...
$ Longitude_L1 : num NA -77.2 -77.2 -77.2 -77.2 ...
$ TravelDistance_Ft : int NA 1596 1279 3697 959 229 4330 530 122 977 ...
$ TravelDistance_Mi : num NA 0.302 0.242 0.7 0.182 ...
$ TravelDistance_Mi_Hvrs: num NA 0.15 0.105 0.165 0.832 ...
$ TravelTime_Sec : num NA 180 37 25 190 29 288 52 76 8 ...
$ TravelTime_Hr : num NA 0.05 0.01028 0.00694 0.05278 ...
$ SpeedAvg_Mph : num NA 6.05 23.57 100.83 3.44 ...
$ Start_ID : chr NA "5004572" "5004573" "5002210" ...
$ Start_Desc : chr NA "BEULAH ST + CHARLES ARRINGTON DR" "WALKER LN + #6363" "WALKER LN + BEULAH ST" ...
$ StartStop_ID : chr "NULL--5004572" "5004572--5004573" "5004573--5002210" "5002210--5002209" ...
$ TD_Mi_SS_q5 : num NA 0.0252 0.2422 0.7324 0.0794 ...
$ TD_Mi_SS_q95 : num NA 0.626 0.242 1.008 0.176 ...
$ TT_Sec_SS_q5 : num NA 11.9 37 30.5 172.9 ...
$ TT_Sec_SS_q95 : num NA 346.3 37 75.8 189.1 ...
$ TT_Hr_SS_q5 : num NA 0.00331 0.01028 0.00849 0.04803 ...
$ TT_Hr_SS_q95 : num NA 0.0962 0.0103 0.0211 0.0525 ...
$ TD_Mi_SS_Mean : num NaN 0.437 0.242 0.908 0.128 ...
$ TD_Mi_SS_Mean_F : num NaN 0.457 0.242 0.977 NaN ...
$ TD_Mi_SS_Med : num NA 0.512 0.242 0.962 0.128 ...
$ TD_Mi_SS_Med_F : num NA 0.512 0.242 1.008 NA ...
$ TD_Mi_SS_Cnt : int 0 14 1 4 2 87 22 118 91 11 ...
$ TD_Mi_SS_Cnt_F : int 0 12 1 3 0 77 18 106 81 9 ...
$ TT_Sec_SS_Mean : num NaN 215.8 37 58.2 181 ...
$ TT_Sec_SS_Mean_F : num NaN 218.9 37 65.5 NaN ...
$ TT_Sec_SS_Med : num NA 223.5 37 65.5 181 ...
$ TT_Sec_SS_Med_F : num NA 223.5 37 65.5 NA ...
$ TT_Sec_SS_Cnt : int 0 14 1 4 2 173 22 141 141 11 ...
$ TT_Sec_SS_Cnt_F : int 0 12 1 2 0 156 18 127 128 9 ...
$ TT_Hr_SS_Mean : num NaN 0.0599 0.0103 0.0162 0.0503 ...
$ TT_Hr_SS_Mean_F : num NaN 0.0608 0.0103 0.0182 NaN ...
$ TT_Hr_SS_Med : num NA 0.0621 0.0103 0.0182 0.0503 ...
$ TT_Hr_SS_Med_F : num NA 0.0621 0.0103 0.0182 NA ...
$ TT_Hr_SS_Cnt : int 0 14 1 4 2 173 22 141 141 11 ...
$ TT_Hr_SS_Cnt_F : int 0 12 1 2 0 156 18 127 128 9 ...
$ TD_Mi_SSHG_q5 : num NA 0.0996 0.2422 0.7002 0.1816 ...
$ TD_Mi_SSHG_q95 : num NA 0.627 0.242 0.7 0.182 ...
$ TT_Sec_SSHG_q5 : num NA 59.6 37 25 190 11.6 236 51.5 55 8.8 ...
$ TT_Sec_SSHG_q95 : num NA 276 37 25 190 ...
$ TT_Hr_SSHG_q5 : num NA 0.01656 0.01028 0.00694 0.05278 ...
$ TT_Hr_SSHG_q95 : num NA 0.07653 0.01028 0.00694 0.05278 ...
$ TD_Mi_SSHG_Mean : num NaN 0.442 0.242 0.7 0.182 ...
$ TD_Mi_SSHG_Mean_F : num NaN 0.491 0.242 0.7 0.182 ...
$ TD_Mi_SSHG_Med : num NA 0.512 0.242 0.7 0.182 ...
$ TD_Mi_SSHG_Med_F : num NA 0.512 0.242 0.7 0.182 ...
$ TD_Mi_SSHG_Cnt : int 0 7 1 1 1 23 6 29 28 3 ...
$ TD_Mi_SSHG_Cnt_F : int 0 5 1 1 1 19 4 25 24 1 ...
$ TT_Sec_SSHG_Mean : num NaN 202 37 25 190 ...
$ TT_Sec_SSHG_Mean_F : num NaN 226 37 25 190 ...
$ TT_Sec_SSHG_Med : num NA 219 37 25 190 134 286 60 65 16 ...
$ TT_Sec_SSHG_Med_F : num NA 219 37 25 190 134 286 60 65 16 ...
$ TT_Sec_SSHG_Cnt : int 0 7 1 1 1 35 6 36 35 3 ...
$ TT_Sec_SSHG_Cnt_F : int 0 5 1 1 1 31 4 32 32 1 ...
$ TT_Hr_SSHG_Mean : num NaN 0.05615 0.01028 0.00694 0.05278 ...
$ TT_Hr_SSHG_Mean_F : num NaN 0.06278 0.01028 0.00694 0.05278 ...
$ TT_Hr_SSHG_Med : num NA 0.06083 0.01028 0.00694 0.05278 ...
$ TT_Hr_SSHG_Med_F : num NA 0.06083 0.01028 0.00694 0.05278 ...
$ TT_Hr_SSHG_Cnt : int 0 7 1 1 1 35 6 36 35 3 ...
$ TT_Hr_SSHG_Cnt_F : int 0 5 1 1 1 31 4 32 28 1 ...
$ RouteAlt_Lag1 : Factor w/ 14 levels "1","10","11",..: NA 1 1 1 1 1 1 6 6 6 ...
Feature engineering.
Calculating a variable to know if the RouteAlt changed. This could be useful in helping to identify weirdness in calculated distances and speeds.
# The source table has already been copied into AllDays_BusDayRoute.
rm(Stats_StSt_HrGrp)

# DirChange  - "Same" when RouteAlt equals the previous row's value, "Change"
#              when it differs, NA when there is no previous value.
# DirChange2 - the same flag as a factor, with NA counted as "Change".
AllDays_DirChange <- mutate(
  AllDays_BusDayRoute,
  DirChange = ifelse(RouteAlt == RouteAlt_Lag1, "Same", "Change"),
  DirChange2 = factor(ifelse(is.na(DirChange), "Change", DirChange))
)

str(AllDays_DirChange)
'data.frame': 2809529 obs. of 95 variables:
$ group : Factor w/ 5 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
$ Bus_ID : int 11 11 11 11 11 11 11 11 11 11 ...
$ Route : chr "S80" "S80" "S80" "S80" ...
$ RouteAlt : Factor w/ 14 levels "1","10","11",..: 1 1 1 1 1 1 6 6 6 6 ...
$ Route_Direction : Factor w/ 12 levels "","ANTICLKW",..: 6 6 6 6 6 6 6 6 6 6 ...
$ Stop_Sequence : int 7 6 3 2 8 1 2 3 4 2 ...
$ Stop_ID : chr "5004572" "5004573" "5002210" "5002209" ...
$ Stop_Desc : chr "BEULAH ST + CHARLES ARRINGTON DR" "WALKER LN + #6363" "WALKER LN + BEULAH ST" "BEULAH ST + CHARLES ARRINGTON DR" ...
$ Event_Type : int 4 4 4 4 3 3 4 4 4 4 ...
$ Event_Description : Factor w/ 3 levels "Serviced Stop ",..: 3 3 3 3 1 1 3 3 3 3 ...
$ Event_Time : POSIXct, format: "2016-10-03 06:06:47" "2016-10-03 06:09:47" ...
$ Departure_Time : POSIXct, format: "2016-10-03 06:06:47" "2016-10-03 06:09:47" ...
$ Dwell_Time : int 0 0 0 0 0 104 0 0 0 0 ...
$ Delta_Time : int -177 24 165 25 73 719 74 76 63 69 ...
$ Odometer_Distance : int 43543 45139 46418 50115 51074 51303 55633 56163 56285 57262 ...
$ Latitude : num 38.8 38.8 38.8 38.8 38.8 ...
$ Longitude : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Heading : int 199 97 276 15 119 100 274 104 241 274 ...
$ RowNum_OG : int 1 3 4 5 6 7 9 10 11 12 ...
$ StopID_New : int NA NA NA NA NA NA NA NA NA NA ...
$ StopID_Clean : chr "5004572" "5004573" "5002210" "5002209" ...
$ StopID_Indicator : Factor w/ 2 levels "ID_Bad","ID_OK": 2 2 2 2 2 2 2 2 2 2 ...
$ Odometer_Distance_Mi : num 8.25 8.55 8.79 9.49 9.67 ...
$ Dwell_Time2 : num 0 0 0 0 0 104 0 0 0 0 ...
$ Event_Time_Yr : int 2016 2016 2016 2016 2016 2016 2016 2016 2016 2016 ...
$ Event_Time_Mth : int 10 10 10 10 10 10 10 10 10 10 ...
$ Event_Time_Date : int 3 3 3 3 3 3 3 3 3 3 ...
$ Event_Time_Day : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tues"<..: 2 2 2 2 2 2 2 2 2 2 ...
$ Event_Time_Hr : int 6 6 6 6 6 6 6 6 6 6 ...
$ Event_Time_Min : int 6 9 10 10 13 14 21 21 23 23 ...
$ Event_Time_HrGroup : Ord.factor w/ 8 levels "Group0_2"<"Group3_5"<..: 3 3 3 3 3 3 3 3 3 3 ...
$ BusDay_EventNum : int 1 2 3 4 5 6 7 8 9 10 ...
$ Odometer_Distance_Lag1: int NA 43543 45139 46418 50115 51074 51303 55633 56163 56285 ...
$ Latitude_L1 : num NA 38.8 38.8 38.8 38.8 ...
$ Longitude_L1 : num NA -77.2 -77.2 -77.2 -77.2 ...
$ TravelDistance_Ft : int NA 1596 1279 3697 959 229 4330 530 122 977 ...
$ TravelDistance_Mi : num NA 0.302 0.242 0.7 0.182 ...
$ TravelDistance_Mi_Hvrs: num NA 0.15 0.105 0.165 0.832 ...
$ TravelTime_Sec : num NA 180 37 25 190 29 288 52 76 8 ...
$ TravelTime_Hr : num NA 0.05 0.01028 0.00694 0.05278 ...
$ SpeedAvg_Mph : num NA 6.05 23.57 100.83 3.44 ...
$ Start_ID : chr NA "5004572" "5004573" "5002210" ...
$ Start_Desc : chr NA "BEULAH ST + CHARLES ARRINGTON DR" "WALKER LN + #6363" "WALKER LN + BEULAH ST" ...
$ StartStop_ID : chr "NULL--5004572" "5004572--5004573" "5004573--5002210" "5002210--5002209" ...
$ TD_Mi_SS_q5 : num NA 0.0252 0.2422 0.7324 0.0794 ...
$ TD_Mi_SS_q95 : num NA 0.626 0.242 1.008 0.176 ...
$ TT_Sec_SS_q5 : num NA 11.9 37 30.5 172.9 ...
$ TT_Sec_SS_q95 : num NA 346.3 37 75.8 189.1 ...
$ TT_Hr_SS_q5 : num NA 0.00331 0.01028 0.00849 0.04803 ...
$ TT_Hr_SS_q95 : num NA 0.0962 0.0103 0.0211 0.0525 ...
$ TD_Mi_SS_Mean : num NaN 0.437 0.242 0.908 0.128 ...
$ TD_Mi_SS_Mean_F : num NaN 0.457 0.242 0.977 NaN ...
$ TD_Mi_SS_Med : num NA 0.512 0.242 0.962 0.128 ...
$ TD_Mi_SS_Med_F : num NA 0.512 0.242 1.008 NA ...
$ TD_Mi_SS_Cnt : int 0 14 1 4 2 87 22 118 91 11 ...
$ TD_Mi_SS_Cnt_F : int 0 12 1 3 0 77 18 106 81 9 ...
$ TT_Sec_SS_Mean : num NaN 215.8 37 58.2 181 ...
$ TT_Sec_SS_Mean_F : num NaN 218.9 37 65.5 NaN ...
$ TT_Sec_SS_Med : num NA 223.5 37 65.5 181 ...
$ TT_Sec_SS_Med_F : num NA 223.5 37 65.5 NA ...
$ TT_Sec_SS_Cnt : int 0 14 1 4 2 173 22 141 141 11 ...
$ TT_Sec_SS_Cnt_F : int 0 12 1 2 0 156 18 127 128 9 ...
$ TT_Hr_SS_Mean : num NaN 0.0599 0.0103 0.0162 0.0503 ...
$ TT_Hr_SS_Mean_F : num NaN 0.0608 0.0103 0.0182 NaN ...
$ TT_Hr_SS_Med : num NA 0.0621 0.0103 0.0182 0.0503 ...
$ TT_Hr_SS_Med_F : num NA 0.0621 0.0103 0.0182 NA ...
$ TT_Hr_SS_Cnt : int 0 14 1 4 2 173 22 141 141 11 ...
$ TT_Hr_SS_Cnt_F : int 0 12 1 2 0 156 18 127 128 9 ...
$ TD_Mi_SSHG_q5 : num NA 0.0996 0.2422 0.7002 0.1816 ...
$ TD_Mi_SSHG_q95 : num NA 0.627 0.242 0.7 0.182 ...
$ TT_Sec_SSHG_q5 : num NA 59.6 37 25 190 11.6 236 51.5 55 8.8 ...
$ TT_Sec_SSHG_q95 : num NA 276 37 25 190 ...
$ TT_Hr_SSHG_q5 : num NA 0.01656 0.01028 0.00694 0.05278 ...
$ TT_Hr_SSHG_q95 : num NA 0.07653 0.01028 0.00694 0.05278 ...
$ TD_Mi_SSHG_Mean : num NaN 0.442 0.242 0.7 0.182 ...
$ TD_Mi_SSHG_Mean_F : num NaN 0.491 0.242 0.7 0.182 ...
$ TD_Mi_SSHG_Med : num NA 0.512 0.242 0.7 0.182 ...
$ TD_Mi_SSHG_Med_F : num NA 0.512 0.242 0.7 0.182 ...
$ TD_Mi_SSHG_Cnt : int 0 7 1 1 1 23 6 29 28 3 ...
$ TD_Mi_SSHG_Cnt_F : int 0 5 1 1 1 19 4 25 24 1 ...
$ TT_Sec_SSHG_Mean : num NaN 202 37 25 190 ...
$ TT_Sec_SSHG_Mean_F : num NaN 226 37 25 190 ...
$ TT_Sec_SSHG_Med : num NA 219 37 25 190 134 286 60 65 16 ...
$ TT_Sec_SSHG_Med_F : num NA 219 37 25 190 134 286 60 65 16 ...
$ TT_Sec_SSHG_Cnt : int 0 7 1 1 1 35 6 36 35 3 ...
$ TT_Sec_SSHG_Cnt_F : int 0 5 1 1 1 31 4 32 32 1 ...
$ TT_Hr_SSHG_Mean : num NaN 0.05615 0.01028 0.00694 0.05278 ...
$ TT_Hr_SSHG_Mean_F : num NaN 0.06278 0.01028 0.00694 0.05278 ...
$ TT_Hr_SSHG_Med : num NA 0.06083 0.01028 0.00694 0.05278 ...
$ TT_Hr_SSHG_Med_F : num NA 0.06083 0.01028 0.00694 0.05278 ...
$ TT_Hr_SSHG_Cnt : int 0 7 1 1 1 35 6 36 35 3 ...
$ TT_Hr_SSHG_Cnt_F : int 0 5 1 1 1 31 4 32 28 1 ...
$ RouteAlt_Lag1 : Factor w/ 14 levels "1","10","11",..: NA 1 1 1 1 1 1 6 6 6 ...
$ DirChange : chr NA "Same" "Same" "Same" ...
$ DirChange2 : Factor w/ 2 levels "Change","Same": 1 2 2 2 2 2 1 2 2 2 ...
Re-ordering the variables to ease comprehension.
# AllDays_BusDayRoute has been superseded by AllDays_DirChange.
rm(AllDays_BusDayRoute)
# Build the final column layout. select() keeps only the columns named below,
# in this order; any column of AllDays_DirChange that is not listed (including
# the two commented-out names) is dropped from AllDays_NewOrder.
AllDays_NewOrder <- select(AllDays_DirChange,
# --- Row / event identifiers ---
RowNum_OG,
group,
StartStop_ID,
BusDay_EventNum,
# --- Bus and route ---
Bus_ID,
Route,
RouteAlt,
# RouteAlt_Lag1,
DirChange2,
Route_Direction,
# --- Stops (previous stop, then current stop) ---
Stop_Sequence,
Start_ID,
Start_Desc,
# Stop_ID,
StopID_Clean,
StopID_Indicator,
Stop_Desc,
# --- Event type ---
Event_Type,
Event_Description,
# --- Event timing ---
Event_Time_Yr,
Event_Time_Mth,
Event_Time_Date,
Event_Time_Day,
Event_Time_Hr,
Event_Time_HrGroup,
Event_Time_Min,
Event_Time,
Departure_Time,
Dwell_Time,
Dwell_Time2,
Delta_Time,
# --- Position ---
Latitude,
Longitude,
Heading,
# --- Odometer and travel distance, followed by its per-StartStop_ID (SS) and
# --- per-StartStop_ID-by-hour-group (SSHG) statistics ---
Odometer_Distance,
Odometer_Distance_Lag1,
Odometer_Distance_Mi,
TravelDistance_Ft,
TravelDistance_Mi,
TravelDistance_Mi_Hvrs,
TD_Mi_SS_q5,
TD_Mi_SS_q95,
TD_Mi_SSHG_q5,
TD_Mi_SSHG_q95,
TD_Mi_SS_Mean,
TD_Mi_SS_Mean_F,
TD_Mi_SSHG_Mean,
TD_Mi_SSHG_Mean_F,
TD_Mi_SS_Med,
TD_Mi_SS_Med_F,
TD_Mi_SSHG_Med,
TD_Mi_SSHG_Med_F,
TD_Mi_SS_Cnt,
TD_Mi_SS_Cnt_F,
TD_Mi_SSHG_Cnt,
TD_Mi_SSHG_Cnt_F,
# --- Travel time in seconds, followed by its SS / SSHG statistics ---
TravelTime_Sec,
TT_Sec_SS_q5,
TT_Sec_SS_q95,
TT_Sec_SSHG_q5,
TT_Sec_SSHG_q95,
TT_Sec_SS_Mean,
TT_Sec_SS_Mean_F,
TT_Sec_SSHG_Mean,
TT_Sec_SSHG_Mean_F,
TT_Sec_SS_Med,
TT_Sec_SS_Med_F,
TT_Sec_SSHG_Med,
TT_Sec_SSHG_Med_F,
TT_Sec_SS_Cnt,
TT_Sec_SS_Cnt_F,
TT_Sec_SSHG_Cnt,
TT_Sec_SSHG_Cnt_F,
# --- Travel time in hours, followed by its SS / SSHG statistics ---
TravelTime_Hr,
TT_Hr_SS_q5,
TT_Hr_SS_q95,
TT_Hr_SSHG_q5,
TT_Hr_SSHG_q95,
TT_Hr_SS_Mean,
TT_Hr_SS_Mean_F,
TT_Hr_SSHG_Mean,
TT_Hr_SSHG_Mean_F,
TT_Hr_SS_Med,
TT_Hr_SS_Med_F,
TT_Hr_SSHG_Med,
TT_Hr_SSHG_Med_F,
TT_Hr_SS_Cnt,
TT_Hr_SS_Cnt_F,
TT_Hr_SSHG_Cnt,
TT_Hr_SSHG_Cnt_F,
# --- Average speed ---
SpeedAvg_Mph
)
# The unordered intermediate table is no longer needed.
rm(AllDays_DirChange)
# Print the structure of the re-ordered table for inspection.
str(AllDays_NewOrder)
'data.frame': 2809529 obs. of 89 variables:
$ RowNum_OG : int 1 3 4 5 6 7 9 10 11 12 ...
$ group : Factor w/ 5 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
$ StartStop_ID : chr "NULL--5004572" "5004572--5004573" "5004573--5002210" "5002210--5002209" ...
$ BusDay_EventNum : int 1 2 3 4 5 6 7 8 9 10 ...
$ Bus_ID : int 11 11 11 11 11 11 11 11 11 11 ...
$ Route : chr "S80" "S80" "S80" "S80" ...
$ RouteAlt : Factor w/ 14 levels "1","10","11",..: 1 1 1 1 1 1 6 6 6 6 ...
$ DirChange2 : Factor w/ 2 levels "Change","Same": 1 2 2 2 2 2 1 2 2 2 ...
$ Route_Direction : Factor w/ 12 levels "","ANTICLKW",..: 6 6 6 6 6 6 6 6 6 6 ...
$ Stop_Sequence : int 7 6 3 2 8 1 2 3 4 2 ...
$ Start_ID : chr NA "5004572" "5004573" "5002210" ...
$ Start_Desc : chr NA "BEULAH ST + CHARLES ARRINGTON DR" "WALKER LN + #6363" "WALKER LN + BEULAH ST" ...
$ StopID_Clean : chr "5004572" "5004573" "5002210" "5002209" ...
$ StopID_Indicator : Factor w/ 2 levels "ID_Bad","ID_OK": 2 2 2 2 2 2 2 2 2 2 ...
$ Stop_Desc : chr "BEULAH ST + CHARLES ARRINGTON DR" "WALKER LN + #6363" "WALKER LN + BEULAH ST" "BEULAH ST + CHARLES ARRINGTON DR" ...
$ Event_Type : int 4 4 4 4 3 3 4 4 4 4 ...
$ Event_Description : Factor w/ 3 levels "Serviced Stop ",..: 3 3 3 3 1 1 3 3 3 3 ...
$ Event_Time_Yr : int 2016 2016 2016 2016 2016 2016 2016 2016 2016 2016 ...
$ Event_Time_Mth : int 10 10 10 10 10 10 10 10 10 10 ...
$ Event_Time_Date : int 3 3 3 3 3 3 3 3 3 3 ...
$ Event_Time_Day : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tues"<..: 2 2 2 2 2 2 2 2 2 2 ...
$ Event_Time_Hr : int 6 6 6 6 6 6 6 6 6 6 ...
$ Event_Time_HrGroup : Ord.factor w/ 8 levels "Group0_2"<"Group3_5"<..: 3 3 3 3 3 3 3 3 3 3 ...
$ Event_Time_Min : int 6 9 10 10 13 14 21 21 23 23 ...
$ Event_Time : POSIXct, format: "2016-10-03 06:06:47" "2016-10-03 06:09:47" ...
$ Departure_Time : POSIXct, format: "2016-10-03 06:06:47" "2016-10-03 06:09:47" ...
$ Dwell_Time : int 0 0 0 0 0 104 0 0 0 0 ...
$ Dwell_Time2 : num 0 0 0 0 0 104 0 0 0 0 ...
$ Delta_Time : int -177 24 165 25 73 719 74 76 63 69 ...
$ Latitude : num 38.8 38.8 38.8 38.8 38.8 ...
$ Longitude : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Heading : int 199 97 276 15 119 100 274 104 241 274 ...
$ Odometer_Distance : int 43543 45139 46418 50115 51074 51303 55633 56163 56285 57262 ...
$ Odometer_Distance_Lag1: int NA 43543 45139 46418 50115 51074 51303 55633 56163 56285 ...
$ Odometer_Distance_Mi : num 8.25 8.55 8.79 9.49 9.67 ...
$ TravelDistance_Ft : int NA 1596 1279 3697 959 229 4330 530 122 977 ...
$ TravelDistance_Mi : num NA 0.302 0.242 0.7 0.182 ...
$ TravelDistance_Mi_Hvrs: num NA 0.15 0.105 0.165 0.832 ...
$ TD_Mi_SS_q5 : num NA 0.0252 0.2422 0.7324 0.0794 ...
$ TD_Mi_SS_q95 : num NA 0.626 0.242 1.008 0.176 ...
$ TD_Mi_SSHG_q5 : num NA 0.0996 0.2422 0.7002 0.1816 ...
$ TD_Mi_SSHG_q95 : num NA 0.627 0.242 0.7 0.182 ...
$ TD_Mi_SS_Mean : num NaN 0.437 0.242 0.908 0.128 ...
$ TD_Mi_SS_Mean_F : num NaN 0.457 0.242 0.977 NaN ...
$ TD_Mi_SSHG_Mean : num NaN 0.442 0.242 0.7 0.182 ...
$ TD_Mi_SSHG_Mean_F : num NaN 0.491 0.242 0.7 0.182 ...
$ TD_Mi_SS_Med : num NA 0.512 0.242 0.962 0.128 ...
$ TD_Mi_SS_Med_F : num NA 0.512 0.242 1.008 NA ...
$ TD_Mi_SSHG_Med : num NA 0.512 0.242 0.7 0.182 ...
$ TD_Mi_SSHG_Med_F : num NA 0.512 0.242 0.7 0.182 ...
$ TD_Mi_SS_Cnt : int 0 14 1 4 2 87 22 118 91 11 ...
$ TD_Mi_SS_Cnt_F : int 0 12 1 3 0 77 18 106 81 9 ...
$ TD_Mi_SSHG_Cnt : int 0 7 1 1 1 23 6 29 28 3 ...
$ TD_Mi_SSHG_Cnt_F : int 0 5 1 1 1 19 4 25 24 1 ...
$ TravelTime_Sec : num NA 180 37 25 190 29 288 52 76 8 ...
$ TT_Sec_SS_q5 : num NA 11.9 37 30.5 172.9 ...
$ TT_Sec_SS_q95 : num NA 346.3 37 75.8 189.1 ...
$ TT_Sec_SSHG_q5 : num NA 59.6 37 25 190 11.6 236 51.5 55 8.8 ...
$ TT_Sec_SSHG_q95 : num NA 276 37 25 190 ...
$ TT_Sec_SS_Mean : num NaN 215.8 37 58.2 181 ...
$ TT_Sec_SS_Mean_F : num NaN 218.9 37 65.5 NaN ...
$ TT_Sec_SSHG_Mean : num NaN 202 37 25 190 ...
$ TT_Sec_SSHG_Mean_F : num NaN 226 37 25 190 ...
$ TT_Sec_SS_Med : num NA 223.5 37 65.5 181 ...
$ TT_Sec_SS_Med_F : num NA 223.5 37 65.5 NA ...
$ TT_Sec_SSHG_Med : num NA 219 37 25 190 134 286 60 65 16 ...
$ TT_Sec_SSHG_Med_F : num NA 219 37 25 190 134 286 60 65 16 ...
$ TT_Sec_SS_Cnt : int 0 14 1 4 2 173 22 141 141 11 ...
$ TT_Sec_SS_Cnt_F : int 0 12 1 2 0 156 18 127 128 9 ...
$ TT_Sec_SSHG_Cnt : int 0 7 1 1 1 35 6 36 35 3 ...
$ TT_Sec_SSHG_Cnt_F : int 0 5 1 1 1 31 4 32 32 1 ...
$ TravelTime_Hr : num NA 0.05 0.01028 0.00694 0.05278 ...
$ TT_Hr_SS_q5 : num NA 0.00331 0.01028 0.00849 0.04803 ...
$ TT_Hr_SS_q95 : num NA 0.0962 0.0103 0.0211 0.0525 ...
$ TT_Hr_SSHG_q5 : num NA 0.01656 0.01028 0.00694 0.05278 ...
$ TT_Hr_SSHG_q95 : num NA 0.07653 0.01028 0.00694 0.05278 ...
$ TT_Hr_SS_Mean : num NaN 0.0599 0.0103 0.0162 0.0503 ...
$ TT_Hr_SS_Mean_F : num NaN 0.0608 0.0103 0.0182 NaN ...
$ TT_Hr_SSHG_Mean : num NaN 0.05615 0.01028 0.00694 0.05278 ...
$ TT_Hr_SSHG_Mean_F : num NaN 0.06278 0.01028 0.00694 0.05278 ...
$ TT_Hr_SS_Med : num NA 0.0621 0.0103 0.0182 0.0503 ...
$ TT_Hr_SS_Med_F : num NA 0.0621 0.0103 0.0182 NA ...
$ TT_Hr_SSHG_Med : num NA 0.06083 0.01028 0.00694 0.05278 ...
$ TT_Hr_SSHG_Med_F : num NA 0.06083 0.01028 0.00694 0.05278 ...
$ TT_Hr_SS_Cnt : int 0 14 1 4 2 173 22 141 141 11 ...
$ TT_Hr_SS_Cnt_F : int 0 12 1 2 0 156 18 127 128 9 ...
$ TT_Hr_SSHG_Cnt : int 0 7 1 1 1 35 6 36 35 3 ...
$ TT_Hr_SSHG_Cnt_F : int 0 5 1 1 1 31 4 32 28 1 ...
$ SpeedAvg_Mph : num NA 6.05 23.57 100.83 3.44 ...
# Open the last 500 rows in the data viewer (swap in the head() call below to
# look at the first 500 instead).
# View(head(AllDays_NewOrder, n = 500))
View(tail(AllDays_NewOrder, n = 500))
Summarizing the data to help spot anomalies.
# Print per-column summary statistics for the re-ordered data frame.
summary(AllDays_NewOrder)
RowNum_OG group StartStop_ID BusDay_EventNum Bus_ID
Min. : 1 1:559521 Length:2809529 Min. : 1.0 Min. : 11
1st Qu.: 784722 2:561389 Class :character 1st Qu.: 113.0 1st Qu.:2922
Median :1563300 3:567794 Mode :character Median : 248.0 Median :6195
Mean :1562504 4:559180 Mean : 290.5 Mean :5382
3rd Qu.:2337981 5:561645 3rd Qu.: 428.0 3rd Qu.:7104
Max. :3119443 Max. :1344.0 Max. :8105
Route RouteAlt DirChange2 Route_Direction
Length:2809529 2 :1128810 Change: 67071 SOUTH :739235
Class :character 1 :1065425 Same :2742458 NORTH :735203
Mode :character 3 : 260372 WEST :649706
4 : 130801 EAST :628074
5 : 75039 LOOP : 35611
6 : 56408 CLOCKWIS: 10671
(Other): 92674 (Other) : 11029
Stop_Sequence Start_ID Start_Desc StopID_Clean
Min. : 1.00 Length:2809529 Length:2809529 Length:2809529
1st Qu.: 12.00 Class :character Class :character Class :character
Median : 24.00 Mode :character Mode :character Mode :character
Mean : 26.83
3rd Qu.: 39.00
Max. :104.00
StopID_Indicator Stop_Desc Event_Type
ID_Bad: 18948 Length:2809529 Min. :3.0
ID_OK :2790581 Class :character 1st Qu.:3.0
Mode :character Median :4.0
Mean :3.6
3rd Qu.:4.0
Max. :5.0
Event_Description Event_Time_Yr
Serviced Stop :1127366 Min. :2016
Unknown Stop : 2579 1st Qu.:2016
UnServiced Stop :1679584 Median :2016
Mean :2016
3rd Qu.:2016
Max. :2016
Event_Time_Mth Event_Time_Date Event_Time_Day Event_Time_Hr Event_Time_HrGroup
Min. :10 Min. :3.000 Sun : 0 Min. : 0.00 Group6_8 :611612
1st Qu.:10 1st Qu.:4.000 Mon :559521 1st Qu.: 8.00 Group15_17:560103
Median :10 Median :5.000 Tues :561389 Median :13.00 Group18_20:461056
Mean :10 Mean :5.001 Wed :567794 Mean :12.97 Group9_11 :396514
3rd Qu.:10 3rd Qu.:6.000 Thurs:559180 3rd Qu.:18.00 Group12_14:353603
Max. :10 Max. :7.000 Fri :561645 Max. :23.00 Group21_23:244522
Sat : 0 (Other) :182119
Event_Time_Min Event_Time Departure_Time
Min. : 0.00 Min. :2016-10-03 00:00:00 Min. :2016-10-03 00:00:00
1st Qu.:14.00 1st Qu.:2016-10-04 08:36:14 1st Qu.:2016-10-04 08:36:20
Median :29.00 Median :2016-10-05 13:49:29 Median :2016-10-05 13:49:38
Mean :29.43 Mean :2016-10-05 13:29:21 Mean :2016-10-05 13:29:28
3rd Qu.:44.00 3rd Qu.:2016-10-06 17:58:06 3rd Qu.:2016-10-06 17:58:13
Max. :59.00 Max. :2016-10-07 23:59:59 Max. :2016-10-08 00:12:31
Dwell_Time Dwell_Time2 Delta_Time Latitude
Min. : 0.00 Min. : 0.000 Min. :-5606.0 Min. : 0.00
1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 14.0 1st Qu.:38.86
Median : 0.00 Median : 0.000 Median : 157.0 Median :38.90
Mean : 12.56 Mean : 6.359 Mean : 268.8 Mean :38.91
3rd Qu.: 5.00 3rd Qu.: 4.000 3rd Qu.: 396.0 3rd Qu.:38.96
Max. :6205.00 Max. :6205.000 Max. : 9426.0 Max. :39.19
Longitude Heading Odometer_Distance Odometer_Distance_Lag1
Min. :-77.45 Min. : 0.0 Min. : 0 Min. : 0
1st Qu.:-77.07 1st Qu.: 89.0 1st Qu.: 177595 1st Qu.: 177326
Median :-77.02 Median :180.0 Median : 377510 Median : 376934
Mean :-77.02 Mean :176.9 Mean : 426254 Mean : 425713
3rd Qu.:-76.97 3rd Qu.:269.0 3rd Qu.: 623667 3rd Qu.: 622879
Max. : 0.00 Max. :360.0 Max. :11108034 Max. :10853226
NA's :6528
Odometer_Distance_Mi TravelDistance_Ft TravelDistance_Mi TravelDistance_Mi_Hvrs
Min. : 0.00 Min. : 1 Min. : 0.0 Min. : 0.000
1st Qu.: 33.64 1st Qu.: 699 1st Qu.: 0.1 1st Qu.: 0.106
Median : 71.50 Median : 1044 Median : 0.2 Median : 0.142
Mean : 80.73 Mean : 1624 Mean : 0.3 Mean : 0.201
3rd Qu.: 118.12 3rd Qu.: 1518 3rd Qu.: 0.3 3rd Qu.: 0.193
Max. :2103.79 Max. :1323464 Max. :250.7 Max. :24.407
NA's :322734 NA's :322734 NA's :6528
TD_Mi_SS_q5 TD_Mi_SS_q95 TD_Mi_SSHG_q5 TD_Mi_SSHG_q95
Min. : 0.000 Min. : 0.000 Min. : 0.00 Min. : 0.00
1st Qu.: 0.086 1st Qu.: 0.262 1st Qu.: 0.09 1st Qu.: 0.25
Median : 0.104 Median : 0.326 Median : 0.11 Median : 0.31
Mean : 0.164 Mean : 0.488 Mean : 0.18 Mean : 0.47
3rd Qu.: 0.139 3rd Qu.: 0.436 3rd Qu.: 0.15 3rd Qu.: 0.42
Max. :219.163 Max. :246.949 Max. :250.66 Max. :250.66
NA's :24757 NA's :24757 NA's :35629 NA's :35629
TD_Mi_SS_Mean TD_Mi_SS_Mean_F TD_Mi_SSHG_Mean TD_Mi_SSHG_Mean_F
Min. : 0.000 Min. : 0.000 Min. : 0.00 Min. : 0.00
1st Qu.: 0.172 1st Qu.: 0.166 1st Qu.: 0.17 1st Qu.: 0.16
Median : 0.212 Median : 0.207 Median : 0.21 Median : 0.21
Mean : 0.307 Mean : 0.291 Mean : 0.31 Mean : 0.29
3rd Qu.: 0.267 3rd Qu.: 0.260 3rd Qu.: 0.27 3rd Qu.: 0.26
Max. :219.163 Max. :219.163 Max. :250.66 Max. :250.66
NA's :24757 NA's :27919 NA's :35629 NA's :44458
TD_Mi_SS_Med TD_Mi_SS_Med_F TD_Mi_SSHG_Med TD_Mi_SSHG_Med_F
Min. : 0.000 Min. : 0.000 Min. : 0.00 Min. : 0.00
1st Qu.: 0.146 1st Qu.: 0.146 1st Qu.: 0.14 1st Qu.: 0.14
Median : 0.196 Median : 0.196 Median : 0.20 Median : 0.20
Mean : 0.288 Mean : 0.282 Mean : 0.29 Mean : 0.28
3rd Qu.: 0.265 3rd Qu.: 0.265 3rd Qu.: 0.27 3rd Qu.: 0.27
Max. :219.163 Max. :219.163 Max. :250.66 Max. :250.66
NA's :24757 NA's :27919 NA's :35629 NA's :44458
TD_Mi_SS_Cnt TD_Mi_SS_Cnt_F TD_Mi_SSHG_Cnt TD_Mi_SSHG_Cnt_F
Min. : 0.0 Min. : 0.0 Min. : 0.00 Min. : 0.00
1st Qu.: 163.0 1st Qu.: 146.0 1st Qu.: 26.00 1st Qu.: 22.00
Median : 280.0 Median : 252.0 Median : 45.00 Median : 39.00
Mean : 347.4 Mean : 312.7 Mean : 57.27 Mean : 50.85
3rd Qu.: 456.0 3rd Qu.: 411.0 3rd Qu.: 75.00 3rd Qu.: 67.00
Max. :1543.0 Max. :1388.0 Max. :663.00 Max. :595.00
TravelTime_Sec TT_Sec_SS_q5 TT_Sec_SS_q95 TT_Sec_SSHG_q5
Min. : 1.0 Min. : 1.00 Min. : 1.00 Min. : 1.00
1st Qu.: 25.0 1st Qu.: 15.00 1st Qu.: 48.00 1st Qu.: 16.00
Median : 39.0 Median : 22.00 Median : 80.05 Median : 23.40
Mean : 104.9 Mean : 61.26 Mean : 183.28 Mean : 67.33
3rd Qu.: 72.0 3rd Qu.: 34.00 3rd Qu.: 134.60 3rd Qu.: 36.70
Max. :60750.0 Max. :60750.00 Max. :60750.00 Max. :60750.00
NA's :6641 NA's :6531 NA's :6531 NA's :6535
TT_Sec_SSHG_q95 TT_Sec_SS_Mean TT_Sec_SS_Mean_F TT_Sec_SSHG_Mean
Min. : 1.00 Min. : 1.00 Min. : 1.00 Min. : 1.00
1st Qu.: 43.80 1st Qu.: 29.06 1st Qu.: 27.54 1st Qu.: 28.38
Median : 72.95 Median : 44.16 Median : 41.91 Median : 43.38
Mean : 169.21 Mean : 104.88 Mean : 91.34 Mean : 104.88
3rd Qu.: 123.65 3rd Qu.: 73.30 3rd Qu.: 69.25 3rd Qu.: 72.93
Max. :60750.00 Max. :60750.00 Max. :60750.00 Max. :60750.00
NA's :6535 NA's :6531 NA's :10519 NA's :6535
TT_Sec_SSHG_Mean_F TT_Sec_SS_Med TT_Sec_SS_Med_F TT_Sec_SSHG_Med
Min. : 1.00 Min. : 1.00 Min. : 1.00 Min. : 1.00
1st Qu.: 27.21 1st Qu.: 26.00 1st Qu.: 26.00 1st Qu.: 26.00
Median : 41.48 Median : 39.00 Median : 39.00 Median : 39.00
Mean : 93.53 Mean : 91.55 Mean : 84.82 Mean : 94.94
3rd Qu.: 70.12 3rd Qu.: 65.00 3rd Qu.: 65.00 3rd Qu.: 67.00
Max. :60750.00 Max. :60750.00 Max. :60750.00 Max. :60750.00
NA's :12811 NA's :6531 NA's :10519 NA's :6535
TT_Sec_SSHG_Med_F TT_Sec_SS_Cnt TT_Sec_SS_Cnt_F TT_Sec_SSHG_Cnt
Min. : 1.00 Min. : 0.0 Min. : 0.0 Min. : 0.00
1st Qu.: 26.00 1st Qu.: 194.0 1st Qu.: 177.0 1st Qu.: 29.00
Median : 38.50 Median : 310.0 Median : 282.0 Median : 51.00
Mean : 88.44 Mean : 384.4 Mean : 349.8 Mean : 63.46
3rd Qu.: 66.50 3rd Qu.: 497.0 3rd Qu.: 452.0 3rd Qu.: 83.00
Max. :60750.00 Max. :1664.0 Max. :1523.0 Max. :691.00
NA's :12811
TT_Sec_SSHG_Cnt_F TravelTime_Hr TT_Hr_SS_q5 TT_Hr_SS_q95
Min. : 0.00 Min. : 0.000 Min. : 0.000 Min. : 0.000
1st Qu.: 26.00 1st Qu.: 0.007 1st Qu.: 0.004 1st Qu.: 0.013
Median : 46.00 Median : 0.011 Median : 0.006 Median : 0.022
Mean : 57.09 Mean : 0.029 Mean : 0.017 Mean : 0.051
3rd Qu.: 74.00 3rd Qu.: 0.020 3rd Qu.: 0.009 3rd Qu.: 0.037
Max. :634.00 Max. :16.875 Max. :16.875 Max. :16.875
NA's :6641 NA's :6531 NA's :6531
TT_Hr_SSHG_q5 TT_Hr_SSHG_q95 TT_Hr_SS_Mean TT_Hr_SS_Mean_F
Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.000
1st Qu.: 0.004 1st Qu.: 0.012 1st Qu.: 0.008 1st Qu.: 0.008
Median : 0.006 Median : 0.020 Median : 0.012 Median : 0.012
Mean : 0.019 Mean : 0.047 Mean : 0.029 Mean : 0.025
3rd Qu.: 0.010 3rd Qu.: 0.034 3rd Qu.: 0.020 3rd Qu.: 0.019
Max. :16.875 Max. :16.875 Max. :16.875 Max. :16.875
NA's :6535 NA's :6535 NA's :6531 NA's :10532
TT_Hr_SSHG_Mean TT_Hr_SSHG_Mean_F TT_Hr_SS_Med TT_Hr_SS_Med_F
Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. : 0.000
1st Qu.: 0.008 1st Qu.: 0.008 1st Qu.: 0.007 1st Qu.: 0.007
Median : 0.012 Median : 0.012 Median : 0.011 Median : 0.011
Mean : 0.029 Mean : 0.026 Mean : 0.025 Mean : 0.024
3rd Qu.: 0.020 3rd Qu.: 0.019 3rd Qu.: 0.018 3rd Qu.: 0.018
Max. :16.875 Max. :16.875 Max. :16.875 Max. :16.875
NA's :6535 NA's :12895 NA's :6531 NA's :10532
TT_Hr_SSHG_Med TT_Hr_SSHG_Med_F TT_Hr_SS_Cnt TT_Hr_SS_Cnt_F
Min. : 0.000 Min. : 0.000 Min. : 0.0 Min. : 0.0
1st Qu.: 0.007 1st Qu.: 0.007 1st Qu.: 194.0 1st Qu.: 176.0
Median : 0.011 Median : 0.011 Median : 310.0 Median : 282.0
Mean : 0.026 Mean : 0.025 Mean : 384.4 Mean : 349.6
3rd Qu.: 0.019 3rd Qu.: 0.018 3rd Qu.: 497.0 3rd Qu.: 452.0
Max. :16.875 Max. :16.875 Max. :1664.0 Max. :1523.0
NA's :6535 NA's :12895
TT_Hr_SSHG_Cnt TT_Hr_SSHG_Cnt_F SpeedAvg_Mph
Min. : 0.00 Min. : 0.00 Min. : 0.0
1st Qu.: 29.00 1st Qu.: 26.00 1st Qu.: 10.1
Median : 51.00 Median : 46.00 Median : 16.7
Mean : 63.46 Mean : 57.05 Mean : 26.5
3rd Qu.: 83.00 3rd Qu.: 74.00 3rd Qu.: 31.2
Max. :691.00 Max. :634.00 Max. :22924.1
NA's :322762
Investigation of TravelDistance_Mi.
View(TravDistMi_Pctiles): 99% of TravelDistance_Mi are about 1 mile or less…but some weird TravelDistance_Mi values (e.g., 584 miles traveled) exist.
Investigation of TravelDistance_Mi.
Why are some TravelDistance_Mi “NA”? Partly because the records are the first trip of the day (for that bus), so I purposefully set the distance to “NA”. Another reason is that the odometer recorded a value less than the previous odometer recording. In most cases, I have no explanation for this - though I have observed that about 67% of all instances where TravelDistance_Mi is NA (other than because it’s the first record of the day) are instances where DirChange2 is “Change”. This is weird and should be raised with WMATA.
Investigation of TravelDistance_Mi.
These records are NA because the current record’s odometer reading is less than the previous record’s odometer reading. Theoretically, this should NOT happen. Note: it appears that about 67% of all instances where TravelDistance_Mi is NA (other than because it’s the first record of the day) are instances where DirChange2 is “Change”. This is weird and should be raised with WMATA.
# Proportions within each row (margin = 1) of the spread table.
prop.table(as.table(as.matrix(TestTable_Spread)), margin = 1)
False True
Change 0.8298069 0.1701931
Same 0.8884570 0.1115430
# Proportions within each column (margin = 2) of the spread table.
prop.table(as.table(as.matrix(TestTable_Spread)), margin = 2)
False True
Change 0.02020231 0.03258635
Same 0.97979769 0.96741365
Investigation of TravelDistance_Mi.
Let’s look at just the TravelDistance_Mi values that are NOT “NA”.
# Show the structure (column types and first values) of TravelDistance_Mi_NoNA.
str(TravelDistance_Mi_NoNA)
'data.frame': 2486795 obs. of 89 variables:
$ RowNum_OG : int 3 4 5 6 7 9 10 11 12 13 ...
$ group : Factor w/ 5 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
$ StartStop_ID : chr "5004572--5004573" "5004573--5002210" "5002210--5002209" "5002209--5000070" ...
$ BusDay_EventNum : int 2 3 4 5 6 7 8 9 10 11 ...
$ Bus_ID : int 11 11 11 11 11 11 11 11 11 11 ...
$ Route : chr "S80" "S80" "S80" "S80" ...
$ RouteAlt : Factor w/ 14 levels "1","10","11",..: 1 1 1 1 1 6 6 6 6 6 ...
$ DirChange2 : Factor w/ 2 levels "Change","Same": 2 2 2 2 2 1 2 2 2 2 ...
$ Route_Direction : Factor w/ 12 levels "","ANTICLKW",..: 6 6 6 6 6 6 6 6 6 6 ...
$ Stop_Sequence : int 6 3 2 8 1 2 3 4 2 6 ...
$ Start_ID : chr "5004572" "5004573" "5002210" "5002209" ...
$ Start_Desc : chr "BEULAH ST + CHARLES ARRINGTON DR" "WALKER LN + #6363" "WALKER LN + BEULAH ST" "BEULAH ST + CHARLES ARRINGTON DR" ...
$ StopID_Clean : chr "5004573" "5002210" "5002209" "5000070" ...
$ StopID_Indicator : Factor w/ 2 levels "ID_Bad","ID_OK": 2 2 2 2 2 2 2 2 2 2 ...
$ Stop_Desc : chr "WALKER LN + #6363" "WALKER LN + BEULAH ST" "BEULAH ST + CHARLES ARRINGTON DR" "FRANCONIA-SPRGFLD STA. + BUS BAY D" ...
$ Event_Type : int 4 4 4 3 3 4 4 4 4 4 ...
$ Event_Description : Factor w/ 3 levels "Serviced Stop ",..: 3 3 3 1 1 3 3 3 3 3 ...
$ Event_Time_Yr : int 2016 2016 2016 2016 2016 2016 2016 2016 2016 2016 ...
$ Event_Time_Mth : int 10 10 10 10 10 10 10 10 10 10 ...
$ Event_Time_Date : int 3 3 3 3 3 3 3 3 3 3 ...
$ Event_Time_Day : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tues"<..: 2 2 2 2 2 2 2 2 2 2 ...
$ Event_Time_Hr : int 6 6 6 6 6 6 6 6 6 6 ...
$ Event_Time_HrGroup : Ord.factor w/ 8 levels "Group0_2"<"Group3_5"<..: 3 3 3 3 3 3 3 3 3 3 ...
$ Event_Time_Min : int 9 10 10 13 14 21 21 23 23 26 ...
$ Event_Time : POSIXct, format: "2016-10-03 06:09:47" "2016-10-03 06:10:24" ...
$ Departure_Time : POSIXct, format: "2016-10-03 06:09:47" "2016-10-03 06:10:24" ...
$ Dwell_Time : int 0 0 0 0 104 0 0 0 0 0 ...
$ Dwell_Time2 : num 0 0 0 0 104 0 0 0 0 0 ...
$ Delta_Time : int 24 165 25 73 719 74 76 63 69 165 ...
$ Latitude : num 38.8 38.8 38.8 38.8 38.8 ...
$ Longitude : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Heading : int 97 276 15 119 100 274 104 241 274 1 ...
$ Odometer_Distance : int 45139 46418 50115 51074 51303 55633 56163 56285 57262 58363 ...
$ Odometer_Distance_Lag1: int 43543 45139 46418 50115 51074 51303 55633 56163 56285 57262 ...
$ Odometer_Distance_Mi : num 8.55 8.79 9.49 9.67 9.72 ...
$ TravelDistance_Ft : int 1596 1279 3697 959 229 4330 530 122 977 1101 ...
$ TravelDistance_Mi : num 0.3023 0.2422 0.7002 0.1816 0.0434 ...
$ TravelDistance_Mi_Hvrs: num 0.15 0.105 0.165 0.832 0.068 ...
$ TD_Mi_SS_q5 : num 0.025246 0.242235 0.732434 0.079432 0.000436 ...
$ TD_Mi_SS_q95 : num 0.626 0.242 1.008 0.176 10.435 ...
$ TD_Mi_SSHG_q5 : num 0.09956 0.24223 0.70019 0.18163 0.00269 ...
$ TD_Mi_SSHG_q95 : num 0.627 0.242 0.7 0.182 0.497 ...
$ TD_Mi_SS_Mean : num 0.437 0.242 0.908 0.128 1.166 ...
$ TD_Mi_SS_Mean_F : num 0.457 0.242 0.977 NaN 0.226 ...
$ TD_Mi_SSHG_Mean : num 0.442 0.242 0.7 0.182 0.232 ...
$ TD_Mi_SSHG_Mean_F : num 0.491 0.242 0.7 0.182 0.228 ...
$ TD_Mi_SS_Med : num 0.5116 0.2422 0.9616 0.1278 0.0426 ...
$ TD_Mi_SS_Med_F : num 0.5116 0.2422 1.0081 NA 0.0426 ...
$ TD_Mi_SSHG_Med : num 0.512 0.242 0.7 0.182 0.108 ...
$ TD_Mi_SSHG_Med_F : num 0.512 0.242 0.7 0.182 0.108 ...
$ TD_Mi_SS_Cnt : int 14 1 4 2 87 22 118 91 11 2 ...
$ TD_Mi_SS_Cnt_F : int 12 1 3 0 77 18 106 81 9 0 ...
$ TD_Mi_SSHG_Cnt : int 7 1 1 1 23 6 29 28 3 1 ...
$ TD_Mi_SSHG_Cnt_F : int 5 1 1 1 19 4 25 24 1 1 ...
$ TravelTime_Sec : num 180 37 25 190 29 288 52 76 8 189 ...
$ TT_Sec_SS_q5 : num 11.9 37 30.5 172.9 10 ...
$ TT_Sec_SS_q95 : num 346.3 37 75.8 189.1 1737.2 ...
$ TT_Sec_SSHG_q5 : num 59.6 37 25 190 11.6 236 51.5 55 8.8 189 ...
$ TT_Sec_SSHG_q95 : num 276 37 25 190 675 ...
$ TT_Sec_SS_Mean : num 215.8 37 58.2 181 585.3 ...
$ TT_Sec_SS_Mean_F : num 218.9 37 65.5 NaN 249.3 ...
$ TT_Sec_SSHG_Mean : num 202 37 25 190 257 ...
$ TT_Sec_SSHG_Mean_F : num 226 37 25 190 244 ...
$ TT_Sec_SS_Med : num 223.5 37 65.5 181 33 ...
$ TT_Sec_SS_Med_F : num 223.5 37 65.5 NA 32 ...
$ TT_Sec_SSHG_Med : num 219 37 25 190 134 286 60 65 16 189 ...
$ TT_Sec_SSHG_Med_F : num 219 37 25 190 134 286 60 65 16 189 ...
$ TT_Sec_SS_Cnt : int 14 1 4 2 173 22 141 141 11 2 ...
$ TT_Sec_SS_Cnt_F : int 12 1 2 0 156 18 127 128 9 0 ...
$ TT_Sec_SSHG_Cnt : int 7 1 1 1 35 6 36 35 3 1 ...
$ TT_Sec_SSHG_Cnt_F : int 5 1 1 1 31 4 32 32 1 1 ...
$ TravelTime_Hr : num 0.05 0.01028 0.00694 0.05278 0.00806 ...
$ TT_Hr_SS_q5 : num 0.00331 0.01028 0.00849 0.04803 0.00278 ...
$ TT_Hr_SS_q95 : num 0.0962 0.0103 0.0211 0.0525 0.4826 ...
$ TT_Hr_SSHG_q5 : num 0.01656 0.01028 0.00694 0.05278 0.00322 ...
$ TT_Hr_SSHG_q95 : num 0.07653 0.01028 0.00694 0.05278 0.18739 ...
$ TT_Hr_SS_Mean : num 0.0599 0.0103 0.0162 0.0503 0.1626 ...
$ TT_Hr_SS_Mean_F : num 0.0608 0.0103 0.0182 NaN 0.0692 ...
$ TT_Hr_SSHG_Mean : num 0.05615 0.01028 0.00694 0.05278 0.0715 ...
$ TT_Hr_SSHG_Mean_F : num 0.06278 0.01028 0.00694 0.05278 0.0677 ...
$ TT_Hr_SS_Med : num 0.06208 0.01028 0.01819 0.05028 0.00917 ...
$ TT_Hr_SS_Med_F : num 0.06208 0.01028 0.01819 NA 0.00889 ...
$ TT_Hr_SSHG_Med : num 0.06083 0.01028 0.00694 0.05278 0.03722 ...
$ TT_Hr_SSHG_Med_F : num 0.06083 0.01028 0.00694 0.05278 0.03722 ...
$ TT_Hr_SS_Cnt : int 14 1 4 2 173 22 141 141 11 2 ...
$ TT_Hr_SS_Cnt_F : int 12 1 2 0 156 18 127 128 9 0 ...
$ TT_Hr_SSHG_Cnt : int 7 1 1 1 35 6 36 35 3 1 ...
$ TT_Hr_SSHG_Cnt_F : int 5 1 1 1 31 4 32 28 1 1 ...
$ SpeedAvg_Mph : num 6.05 23.57 100.83 3.44 5.38 ...
# Print per-column summary statistics for TravelDistance_Mi_NoNA.
summary(TravelDistance_Mi_NoNA)
RowNum_OG group StartStop_ID BusDay_EventNum Bus_ID
Min. : 3 1:496190 Length:2486795 Min. : 2.0 Min. : 11
1st Qu.: 786568 2:497932 Class :character 1st Qu.: 115.0 1st Qu.:2923
Median :1590497 3:501611 Mode :character Median : 251.0 Median :6202
Mean :1578192 4:495069 Mean : 293.2 Mean :5431
3rd Qu.:2351264 5:495993 3rd Qu.: 431.0 3rd Qu.:7113
Max. :3119443 Max. :1344.0 Max. :8105
Route RouteAlt DirChange2 Route_Direction
Length:2486795 2 :994645 Change: 50239 SOUTH :667198
Class :character 1 :943279 Same :2436556 NORTH :662471
Mode :character 3 :229032 WEST :565616
4 :117090 EAST :543386
5 : 67811 LOOP : 33484
6 : 51391 CLOCKWIS: 7012
(Other): 83547 (Other) : 7628
Stop_Sequence Start_ID Start_Desc StopID_Clean
Min. : 1.00 Length:2486795 Length:2486795 Length:2486795
1st Qu.: 12.00 Class :character Class :character Class :character
Median : 24.00 Mode :character Mode :character Mode :character
Mean : 27.13
3rd Qu.: 39.00
Max. :104.00
StopID_Indicator Stop_Desc Event_Type
ID_Bad: 14271 Length:2486795 Min. :3.000
ID_OK :2472524 Class :character 1st Qu.:3.000
Mode :character Median :4.000
Mean :3.626
3rd Qu.:4.000
Max. :5.000
Event_Description Event_Time_Yr
Serviced Stop : 930934 Min. :2016
Unknown Stop : 1794 1st Qu.:2016
UnServiced Stop :1554067 Median :2016
Mean :2016
3rd Qu.:2016
Max. :2016
Event_Time_Mth Event_Time_Date Event_Time_Day Event_Time_Hr Event_Time_HrGroup
Min. :10 Min. :3.000 Sun : 0 Min. : 0.00 Group6_8 :538348
1st Qu.:10 1st Qu.:4.000 Mon :496190 1st Qu.: 8.00 Group15_17:497156
Median :10 Median :5.000 Tues :497932 Median :13.00 Group18_20:408957
Mean :10 Mean :4.999 Wed :501611 Mean :12.99 Group9_11 :351804
3rd Qu.:10 3rd Qu.:6.000 Thurs:495069 3rd Qu.:18.00 Group12_14:314050
Max. :10 Max. :7.000 Fri :495993 Max. :23.00 Group21_23:217259
Sat : 0 (Other) :159221
Event_Time_Min Event_Time Departure_Time
Min. : 0.00 Min. :2016-10-03 00:00:09 Min. :2016-10-03 00:00:09
1st Qu.:14.00 1st Qu.:2016-10-04 08:35:52 1st Qu.:2016-10-04 08:35:59
Median :29.00 Median :2016-10-05 13:46:00 Median :2016-10-05 13:46:06
Mean :29.43 Mean :2016-10-05 13:27:43 Mean :2016-10-05 13:27:49
3rd Qu.:44.00 3rd Qu.:2016-10-06 17:57:32 3rd Qu.:2016-10-06 17:57:39
Max. :59.00 Max. :2016-10-07 23:59:59 Max. :2016-10-08 00:12:31
Dwell_Time Dwell_Time2 Delta_Time Latitude
Min. : 0.00 Min. : 0.000 Min. :-5606.0 Min. : 0.00
1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 16.0 1st Qu.:38.86
Median : 0.00 Median : 0.000 Median : 160.0 Median :38.90
Mean : 11.86 Mean : 5.994 Mean : 274.1 Mean :38.91
3rd Qu.: 4.00 3rd Qu.: 4.000 3rd Qu.: 402.0 3rd Qu.:38.96
Max. :6205.00 Max. :6205.000 Max. : 9426.0 Max. :39.19
Longitude Heading Odometer_Distance Odometer_Distance_Lag1
Min. :-77.45 Min. : 0.0 Min. : 1 Min. : 0
1st Qu.:-77.07 1st Qu.: 89.0 1st Qu.: 200268 1st Qu.: 198635
Median :-77.01 Median :180.0 Median : 394700 Median : 393026
Mean :-77.02 Mean :176.7 Mean : 443225 Mean : 441601
3rd Qu.:-76.97 3rd Qu.:269.0 3rd Qu.: 633936 3rd Qu.: 632313
Max. : 0.00 Max. :360.0 Max. :11108034 Max. :10853226
Odometer_Distance_Mi TravelDistance_Ft TravelDistance_Mi TravelDistance_Mi_Hvrs
Min. : 0.0002 Min. : 1 Min. : 0.00019 Min. : 0.0000
1st Qu.: 37.9295 1st Qu.: 699 1st Qu.: 0.13239 1st Qu.: 0.1034
Median : 74.7538 Median : 1044 Median : 0.19773 Median : 0.1378
Mean : 83.9442 Mean : 1624 Mean : 0.30760 Mean : 0.1918
3rd Qu.: 120.0635 3rd Qu.: 1518 3rd Qu.: 0.28750 3rd Qu.: 0.1828
Max. :2103.7943 Max. :1323464 Max. :250.65606 Max. :24.1507
TD_Mi_SS_q5 TD_Mi_SS_q95 TD_Mi_SSHG_q5 TD_Mi_SSHG_q95
Min. : 0.00019 Min. : 0.00019 Min. : 0.00019 Min. : 0.00019
1st Qu.: 0.08848 1st Qu.: 0.25878 1st Qu.: 0.09167 1st Qu.: 0.24754
Median : 0.10608 Median : 0.32239 Median : 0.11395 Median : 0.31174
Mean : 0.16872 Mean : 0.47949 Mean : 0.18528 Mean : 0.46625
3rd Qu.: 0.13977 3rd Qu.: 0.42822 3rd Qu.: 0.15093 3rd Qu.: 0.41899
Max. :219.16288 Max. :246.94938 Max. :250.65606 Max. :250.65606
TD_Mi_SS_Mean TD_Mi_SS_Mean_F TD_Mi_SSHG_Mean TD_Mi_SSHG_Mean_F
Min. : 0.00019 Min. : 0.0002 Min. : 0.00019 Min. : 0.000
1st Qu.: 0.17129 1st Qu.: 0.1663 1st Qu.: 0.16760 1st Qu.: 0.163
Median : 0.21082 Median : 0.2058 Median : 0.20965 Median : 0.206
Mean : 0.30760 Mean : 0.2916 Mean : 0.30760 Mean : 0.294
3rd Qu.: 0.26422 3rd Qu.: 0.2582 3rd Qu.: 0.26616 3rd Qu.: 0.262
Max. :219.16288 Max. :219.1629 Max. :250.65606 Max. :250.656
NA's :2678 NA's :4904
TD_Mi_SS_Med TD_Mi_SS_Med_F TD_Mi_SSHG_Med TD_Mi_SSHG_Med_F
Min. : 0.00019 Min. : 0.0002 Min. : 0.00019 Min. : 0.000
1st Qu.: 0.14602 1st Qu.: 0.1458 1st Qu.: 0.14403 1st Qu.: 0.144
Median : 0.19470 Median : 0.1947 Median : 0.19527 Median : 0.195
Mean : 0.28931 Mean : 0.2827 Mean : 0.29152 Mean : 0.285
3rd Qu.: 0.26326 3rd Qu.: 0.2633 3rd Qu.: 0.26657 3rd Qu.: 0.266
Max. :219.16288 Max. :219.1629 Max. :250.65606 Max. :250.656
NA's :2678 NA's :4904
TD_Mi_SS_Cnt TD_Mi_SS_Cnt_F TD_Mi_SSHG_Cnt TD_Mi_SSHG_Cnt_F
Min. : 1.0 Min. : 0.0 Min. : 1.00 Min. : 0.00
1st Qu.: 178.0 1st Qu.: 160.0 1st Qu.: 28.00 1st Qu.: 24.00
Median : 295.0 Median : 266.0 Median : 48.00 Median : 42.00
Mean : 363.3 Mean : 327.1 Mean : 60.01 Mean : 53.31
3rd Qu.: 476.0 3rd Qu.: 428.0 3rd Qu.: 78.00 3rd Qu.: 70.00
Max. :1543.0 Max. :1388.0 Max. :663.00 Max. :595.00
TravelTime_Sec TT_Sec_SS_q5 TT_Sec_SS_q95 TT_Sec_SSHG_q5
Min. : 1 Min. : 1.00 Min. : 1.00 Min. : 1.00
1st Qu.: 24 1st Qu.: 15.00 1st Qu.: 47.00 1st Qu.: 15.20
Median : 38 Median : 21.00 Median : 77.75 Median : 22.50
Mean : 100 Mean : 57.38 Mean : 176.22 Mean : 62.94
3rd Qu.: 70 3rd Qu.: 32.00 3rd Qu.: 129.65 3rd Qu.: 34.80
Max. :54551 Max. :54551.00 Max. :54551.00 Max. :54551.00
NA's :28
TT_Sec_SSHG_q95 TT_Sec_SS_Mean TT_Sec_SS_Mean_F TT_Sec_SSHG_Mean
Min. : 1.00 Min. : 1.00 Min. : 1.00 Min. : 1.00
1st Qu.: 42.70 1st Qu.: 28.20 1st Qu.: 26.62 1st Qu.: 27.51
Median : 70.55 Median : 42.61 Median : 40.46 Median : 41.76
Mean : 161.25 Mean : 99.62 Mean : 86.96 Mean : 99.55
3rd Qu.: 119.60 3rd Qu.: 69.71 3rd Qu.: 66.44 3rd Qu.: 70.02
Max. :54551.00 Max. :54551.00 Max. :54551.00 Max. :54551.00
NA's :2603
TT_Sec_SSHG_Mean_F TT_Sec_SS_Med TT_Sec_SS_Med_F TT_Sec_SSHG_Med
Min. : 1.00 Min. : 1.00 Min. : 1.00 Min. : 1.00
1st Qu.: 26.34 1st Qu.: 25.00 1st Qu.: 25.00 1st Qu.: 25.00
Median : 39.96 Median : 37.00 Median : 37.00 Median : 37.00
Mean : 88.81 Mean : 86.88 Mean : 80.62 Mean : 90.07
3rd Qu.: 67.22 3rd Qu.: 62.00 3rd Qu.: 62.00 3rd Qu.: 64.00
Max. :54551.00 Max. :54551.00 Max. :54551.00 Max. :54551.00
NA's :3772 NA's :2603
TT_Sec_SSHG_Med_F TT_Sec_SS_Cnt TT_Sec_SS_Cnt_F TT_Sec_SSHG_Cnt
Min. : 1.00 Min. : 1.0 Min. : 0.0 Min. : 1.0
1st Qu.: 25.00 1st Qu.: 200.0 1st Qu.: 183.0 1st Qu.: 30.0
Median : 37.00 Median : 321.0 Median : 292.0 Median : 52.0
Mean : 83.87 Mean : 392.4 Mean : 357.2 Mean : 64.7
3rd Qu.: 64.00 3rd Qu.: 509.0 3rd Qu.: 464.0 3rd Qu.: 84.0
Max. :54551.00 Max. :1664.0 Max. :1523.0 Max. :691.0
NA's :3772
TT_Sec_SSHG_Cnt_F TravelTime_Hr TT_Hr_SS_q5 TT_Hr_SS_q95
Min. : 0.00 Min. : 0.000278 Min. : 0.000278 Min. : 0.000278
1st Qu.: 27.00 1st Qu.: 0.006667 1st Qu.: 0.004167 1st Qu.: 0.013056
Median : 47.00 Median : 0.010556 Median : 0.005833 Median : 0.021597
Mean : 58.23 Mean : 0.027782 Mean : 0.015938 Mean : 0.048950
3rd Qu.: 76.00 3rd Qu.: 0.019444 3rd Qu.: 0.008889 3rd Qu.: 0.036014
Max. :634.00 Max. :15.153056 Max. :15.153056 Max. :15.153056
NA's :28
TT_Hr_SSHG_q5 TT_Hr_SSHG_q95 TT_Hr_SS_Mean TT_Hr_SS_Mean_F
Min. : 0.000278 Min. : 0.000278 Min. : 0.000278 Min. : 0.0003
1st Qu.: 0.004222 1st Qu.: 0.011861 1st Qu.: 0.007832 1st Qu.: 0.0074
Median : 0.006250 Median : 0.019597 Median : 0.011836 Median : 0.0112
Mean : 0.017485 Mean : 0.044792 Mean : 0.027673 Mean : 0.0242
3rd Qu.: 0.009667 3rd Qu.: 0.033222 3rd Qu.: 0.019363 3rd Qu.: 0.0185
Max. :15.153056 Max. :15.153056 Max. :15.153056 Max. :15.1531
NA's :2612
TT_Hr_SSHG_Mean TT_Hr_SSHG_Mean_F TT_Hr_SS_Med TT_Hr_SS_Med_F
Min. : 0.000278 Min. : 0.000 Min. : 0.000278 Min. : 0.0003
1st Qu.: 0.007643 1st Qu.: 0.007 1st Qu.: 0.006944 1st Qu.: 0.0069
Median : 0.011600 Median : 0.011 Median : 0.010278 Median : 0.0103
Mean : 0.027654 Mean : 0.025 Mean : 0.024132 Mean : 0.0224
3rd Qu.: 0.019450 3rd Qu.: 0.019 3rd Qu.: 0.017222 3rd Qu.: 0.0172
Max. :15.153056 Max. :15.153 Max. :15.153056 Max. :15.1531
NA's :3842 NA's :2612
TT_Hr_SSHG_Med TT_Hr_SSHG_Med_F TT_Hr_SS_Cnt TT_Hr_SS_Cnt_F
Min. : 0.000278 Min. : 0.000 Min. : 1.0 Min. : 0
1st Qu.: 0.006944 1st Qu.: 0.007 1st Qu.: 200.0 1st Qu.: 183
Median : 0.010278 Median : 0.010 Median : 321.0 Median : 292
Mean : 0.025019 Mean : 0.023 Mean : 392.4 Mean : 357
3rd Qu.: 0.017778 3rd Qu.: 0.018 3rd Qu.: 509.0 3rd Qu.: 464
Max. :15.153056 Max. :15.153 Max. :1664.0 Max. :1523
NA's :3842
TT_Hr_SSHG_Cnt TT_Hr_SSHG_Cnt_F SpeedAvg_Mph
Min. : 1.0 Min. : 0.00 Min. : 0.00
1st Qu.: 30.0 1st Qu.: 27.00 1st Qu.: 10.10
Median : 52.0 Median : 47.00 Median : 16.68
Mean : 64.7 Mean : 58.19 Mean : 26.54
3rd Qu.: 84.0 3rd Qu.: 76.00 3rd Qu.: 31.17
Max. :691.0 Max. :634.00 Max. :22924.09
NA's :28
Investigation of TravelDistance_Mi.
Let’s plot just the TravelDistance_Mi values that are NOT “NA”.
Investigation of TravelDistance_Mi.
Looking at the extremely large TravelDistance_Mi values. Some (approx. 27%) of the TravelDistance_Mi values > 1 mile occur when DirChange2 changes…but what about the other ~73%?
Investigation of TravelDistance_Mi.
Any relation with DirChange2? Doesn’t look as if this is so.
# Proportions within each row (margin = 1) of the spread table.
prop.table(as.table(as.matrix(ExtremeTravDist_Spread)), margin = 1)
False True
Change 0.76966102 0.23033898
Same 0.98944289 0.01055711
# Proportions within each column (margin = 2) of the spread table.
prop.table(as.table(as.matrix(ExtremeTravDist_Spread)), margin = 2)
False True
Change 0.01578567 0.31028288
Same 0.98421433 0.68971712
Investigation of TravelDistance_Mi.
Looking at specific buses and StartStop_ID.
Investigation of TravelDistance_Mi & TravelDistance_Mi_New.
If TravelDistance_Mi is below the 5th percentile for that StartStop_ID, or if TravelDistance_Mi is above the 95th percentile for that StartStop_ID, or if TravelDistance_Mi is NA (when the BusDay_EventNum != 1), consider this an outlier. In this case, replace the value with the mean for that StartStop_ID and HourGroup (TD_Mi_SSHG_Mean_F), or if there are not enough values at the HourGroup level, replace it with the mean for that StartStop_ID.
# View(tail(AllDays_NewOrder, 500))
# Derive a cleaned travel distance (TravelDistance_Mi_New) and a label naming
# the source used for each row (TravelDistance_Mi_New_Label).
#
# Replacement rules, evaluated in order (first match wins):
#   1. Non-NA value outside [TD_Mi_SSHG_q5, TD_Mi_SSHG_q95] with
#      TD_Mi_SSHG_Cnt_F >= 20            -> TD_Mi_SSHG_Mean_F
#   2. Same outlier test, TD_Mi_SSHG_Cnt_F < 20 and TD_Mi_SS_Cnt_F >= 20
#                                         -> TD_Mi_SS_Mean_F
#   3. Same outlier test, TD_Mi_SS_Cnt_F < 20 and TD_Mi_SS_Cnt >= 20
#                                         -> TD_Mi_SS_Mean
#   4. NA value, BusDay_EventNum != 1, TravelDistance_Mi_Hvrs != 0
#                                         -> TravelDistance_Mi_Hvrs
#   5. NA value, BusDay_EventNum != 1, TravelDistance_Mi_Hvrs == 0
#                                         -> TD_Mi_SS_Mean
#   otherwise                             -> TravelDistance_Mi (unchanged)
#
# The two ifelse() chains must stay in lock-step: the label chain returns the
# NAME of the column that the value chain returns. ifelse() yields NA wherever
# its test is NA, so rows with an NA test do not fall through to later rules.
AllDays_NewTravelDist <-
  mutate(AllDays_NewOrder,
         TravelDistance_Mi_New =
           ifelse(!is.na(TravelDistance_Mi) &
                    (TravelDistance_Mi < TD_Mi_SSHG_q5 |
                       TravelDistance_Mi > TD_Mi_SSHG_q95
                    ) &
                    TD_Mi_SSHG_Cnt_F >= 20,
                  TD_Mi_SSHG_Mean_F,
           ifelse(!is.na(TravelDistance_Mi) &
                    (TravelDistance_Mi < TD_Mi_SSHG_q5 |
                       TravelDistance_Mi > TD_Mi_SSHG_q95
                    ) &
                    TD_Mi_SSHG_Cnt_F < 20 &
                    TD_Mi_SS_Cnt_F >= 20,
                  TD_Mi_SS_Mean_F,
           ifelse(!is.na(TravelDistance_Mi) &
                    (TravelDistance_Mi < TD_Mi_SSHG_q5 |
                       TravelDistance_Mi > TD_Mi_SSHG_q95
                    ) &
                    TD_Mi_SS_Cnt_F < 20 &
                    TD_Mi_SS_Cnt >= 20,
                  TD_Mi_SS_Mean,
           ifelse(is.na(TravelDistance_Mi) &
                    BusDay_EventNum != 1 &
                    TravelDistance_Mi_Hvrs != 0,
                  TravelDistance_Mi_Hvrs,
           ifelse(is.na(TravelDistance_Mi) &
                    BusDay_EventNum != 1 &
                    TravelDistance_Mi_Hvrs == 0,
                  TD_Mi_SS_Mean,
                  TravelDistance_Mi
           ))))),
         TravelDistance_Mi_New_Label =
           factor(ifelse(!is.na(TravelDistance_Mi) &
                           (TravelDistance_Mi < TD_Mi_SSHG_q5 |
                              TravelDistance_Mi > TD_Mi_SSHG_q95
                           ) &
                           TD_Mi_SSHG_Cnt_F >= 20,
                         "TD_Mi_SSHG_Mean_F",
                  ifelse(!is.na(TravelDistance_Mi) &
                           (TravelDistance_Mi < TD_Mi_SSHG_q5 |
                              TravelDistance_Mi > TD_Mi_SSHG_q95
                           ) &
                           TD_Mi_SSHG_Cnt_F < 20 &
                           TD_Mi_SS_Cnt_F >= 20,
                         "TD_Mi_SS_Mean_F",
                  ifelse(!is.na(TravelDistance_Mi) &
                           (TravelDistance_Mi < TD_Mi_SSHG_q5 |
                              TravelDistance_Mi > TD_Mi_SSHG_q95
                           ) &
                           TD_Mi_SS_Cnt_F < 20 &
                           TD_Mi_SS_Cnt >= 20,
                         "TD_Mi_SS_Mean",
                  ifelse(is.na(TravelDistance_Mi) &
                           BusDay_EventNum != 1 &
                           TravelDistance_Mi_Hvrs != 0,
                         "TravelDistance_Mi_Hvrs",
                  ifelse(is.na(TravelDistance_Mi) &
                           BusDay_EventNum != 1 &
                           TravelDistance_Mi_Hvrs == 0,
                         # Must be the quoted column NAME: passing the numeric
                         # column here would turn each distinct mean value into
                         # its own factor level.
                         "TD_Mi_SS_Mean",
                         "TravelDistance_Mi"
                  )))))
           )
  )
str(AllDays_NewTravelDist)
'data.frame': 2809529 obs. of 91 variables:
$ RowNum_OG : int 1 3 4 5 6 7 9 10 11 12 ...
$ group : Factor w/ 5 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
$ StartStop_ID : chr "NULL--5004572" "5004572--5004573" "5004573--5002210" "5002210--5002209" ...
$ BusDay_EventNum : int 1 2 3 4 5 6 7 8 9 10 ...
$ Bus_ID : int 11 11 11 11 11 11 11 11 11 11 ...
$ Route : chr "S80" "S80" "S80" "S80" ...
$ RouteAlt : Factor w/ 14 levels "1","10","11",..: 1 1 1 1 1 1 6 6 6 6 ...
$ DirChange2 : Factor w/ 2 levels "Change","Same": 1 2 2 2 2 2 1 2 2 2 ...
$ Route_Direction : Factor w/ 12 levels "","ANTICLKW",..: 6 6 6 6 6 6 6 6 6 6 ...
$ Stop_Sequence : int 7 6 3 2 8 1 2 3 4 2 ...
$ Start_ID : chr NA "5004572" "5004573" "5002210" ...
$ Start_Desc : chr NA "BEULAH ST + CHARLES ARRINGTON DR" "WALKER LN + #6363" "WALKER LN + BEULAH ST" ...
$ StopID_Clean : chr "5004572" "5004573" "5002210" "5002209" ...
$ StopID_Indicator : Factor w/ 2 levels "ID_Bad","ID_OK": 2 2 2 2 2 2 2 2 2 2 ...
$ Stop_Desc : chr "BEULAH ST + CHARLES ARRINGTON DR" "WALKER LN + #6363" "WALKER LN + BEULAH ST" "BEULAH ST + CHARLES ARRINGTON DR" ...
$ Event_Type : int 4 4 4 4 3 3 4 4 4 4 ...
$ Event_Description : Factor w/ 3 levels "Serviced Stop ",..: 3 3 3 3 1 1 3 3 3 3 ...
$ Event_Time_Yr : int 2016 2016 2016 2016 2016 2016 2016 2016 2016 2016 ...
$ Event_Time_Mth : int 10 10 10 10 10 10 10 10 10 10 ...
$ Event_Time_Date : int 3 3 3 3 3 3 3 3 3 3 ...
$ Event_Time_Day : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tues"<..: 2 2 2 2 2 2 2 2 2 2 ...
$ Event_Time_Hr : int 6 6 6 6 6 6 6 6 6 6 ...
$ Event_Time_HrGroup : Ord.factor w/ 8 levels "Group0_2"<"Group3_5"<..: 3 3 3 3 3 3 3 3 3 3 ...
$ Event_Time_Min : int 6 9 10 10 13 14 21 21 23 23 ...
$ Event_Time : POSIXct, format: "2016-10-03 06:06:47" "2016-10-03 06:09:47" ...
$ Departure_Time : POSIXct, format: "2016-10-03 06:06:47" "2016-10-03 06:09:47" ...
$ Dwell_Time : int 0 0 0 0 0 104 0 0 0 0 ...
$ Dwell_Time2 : num 0 0 0 0 0 104 0 0 0 0 ...
$ Delta_Time : int -177 24 165 25 73 719 74 76 63 69 ...
$ Latitude : num 38.8 38.8 38.8 38.8 38.8 ...
$ Longitude : num -77.2 -77.2 -77.2 -77.2 -77.2 ...
$ Heading : int 199 97 276 15 119 100 274 104 241 274 ...
$ Odometer_Distance : int 43543 45139 46418 50115 51074 51303 55633 56163 56285 57262 ...
$ Odometer_Distance_Lag1 : int NA 43543 45139 46418 50115 51074 51303 55633 56163 56285 ...
$ Odometer_Distance_Mi : num 8.25 8.55 8.79 9.49 9.67 ...
$ TravelDistance_Ft : int NA 1596 1279 3697 959 229 4330 530 122 977 ...
$ TravelDistance_Mi : num NA 0.302 0.242 0.7 0.182 ...
$ TravelDistance_Mi_Hvrs : num NA 0.15 0.105 0.165 0.832 ...
$ TD_Mi_SS_q5 : num NA 0.0252 0.2422 0.7324 0.0794 ...
$ TD_Mi_SS_q95 : num NA 0.626 0.242 1.008 0.176 ...
$ TD_Mi_SSHG_q5 : num NA 0.0996 0.2422 0.7002 0.1816 ...
$ TD_Mi_SSHG_q95 : num NA 0.627 0.242 0.7 0.182 ...
$ TD_Mi_SS_Mean : num NaN 0.437 0.242 0.908 0.128 ...
$ TD_Mi_SS_Mean_F : num NaN 0.457 0.242 0.977 NaN ...
$ TD_Mi_SSHG_Mean : num NaN 0.442 0.242 0.7 0.182 ...
$ TD_Mi_SSHG_Mean_F : num NaN 0.491 0.242 0.7 0.182 ...
$ TD_Mi_SS_Med : num NA 0.512 0.242 0.962 0.128 ...
$ TD_Mi_SS_Med_F : num NA 0.512 0.242 1.008 NA ...
$ TD_Mi_SSHG_Med : num NA 0.512 0.242 0.7 0.182 ...
$ TD_Mi_SSHG_Med_F : num NA 0.512 0.242 0.7 0.182 ...
$ TD_Mi_SS_Cnt : int 0 14 1 4 2 87 22 118 91 11 ...
$ TD_Mi_SS_Cnt_F : int 0 12 1 3 0 77 18 106 81 9 ...
$ TD_Mi_SSHG_Cnt : int 0 7 1 1 1 23 6 29 28 3 ...
$ TD_Mi_SSHG_Cnt_F : int 0 5 1 1 1 19 4 25 24 1 ...
$ TravelTime_Sec : num NA 180 37 25 190 29 288 52 76 8 ...
$ TT_Sec_SS_q5 : num NA 11.9 37 30.5 172.9 ...
$ TT_Sec_SS_q95 : num NA 346.3 37 75.8 189.1 ...
$ TT_Sec_SSHG_q5 : num NA 59.6 37 25 190 11.6 236 51.5 55 8.8 ...
$ TT_Sec_SSHG_q95 : num NA 276 37 25 190 ...
$ TT_Sec_SS_Mean : num NaN 215.8 37 58.2 181 ...
$ TT_Sec_SS_Mean_F : num NaN 218.9 37 65.5 NaN ...
$ TT_Sec_SSHG_Mean : num NaN 202 37 25 190 ...
$ TT_Sec_SSHG_Mean_F : num NaN 226 37 25 190 ...
$ TT_Sec_SS_Med : num NA 223.5 37 65.5 181 ...
$ TT_Sec_SS_Med_F : num NA 223.5 37 65.5 NA ...
$ TT_Sec_SSHG_Med : num NA 219 37 25 190 134 286 60 65 16 ...
$ TT_Sec_SSHG_Med_F : num NA 219 37 25 190 134 286 60 65 16 ...
$ TT_Sec_SS_Cnt : int 0 14 1 4 2 173 22 141 141 11 ...
$ TT_Sec_SS_Cnt_F : int 0 12 1 2 0 156 18 127 128 9 ...
$ TT_Sec_SSHG_Cnt : int 0 7 1 1 1 35 6 36 35 3 ...
$ TT_Sec_SSHG_Cnt_F : int 0 5 1 1 1 31 4 32 32 1 ...
$ TravelTime_Hr : num NA 0.05 0.01028 0.00694 0.05278 ...
$ TT_Hr_SS_q5 : num NA 0.00331 0.01028 0.00849 0.04803 ...
$ TT_Hr_SS_q95 : num NA 0.0962 0.0103 0.0211 0.0525 ...
$ TT_Hr_SSHG_q5 : num NA 0.01656 0.01028 0.00694 0.05278 ...
$ TT_Hr_SSHG_q95 : num NA 0.07653 0.01028 0.00694 0.05278 ...
$ TT_Hr_SS_Mean : num NaN 0.0599 0.0103 0.0162 0.0503 ...
$ TT_Hr_SS_Mean_F : num NaN 0.0608 0.0103 0.0182 NaN ...
$ TT_Hr_SSHG_Mean : num NaN 0.05615 0.01028 0.00694 0.05278 ...
$ TT_Hr_SSHG_Mean_F : num NaN 0.06278 0.01028 0.00694 0.05278 ...
$ TT_Hr_SS_Med : num NA 0.0621 0.0103 0.0182 0.0503 ...
$ TT_Hr_SS_Med_F : num NA 0.0621 0.0103 0.0182 NA ...
$ TT_Hr_SSHG_Med : num NA 0.06083 0.01028 0.00694 0.05278 ...
$ TT_Hr_SSHG_Med_F : num NA 0.06083 0.01028 0.00694 0.05278 ...
$ TT_Hr_SS_Cnt : int 0 14 1 4 2 173 22 141 141 11 ...
$ TT_Hr_SS_Cnt_F : int 0 12 1 2 0 156 18 127 128 9 ...
$ TT_Hr_SSHG_Cnt : int 0 7 1 1 1 35 6 36 35 3 ...
$ TT_Hr_SSHG_Cnt_F : int 0 5 1 1 1 31 4 32 28 1 ...
$ SpeedAvg_Mph : num NA 6.05 23.57 100.83 3.44 ...
$ TravelDistance_Mi_New : num NA 0.302 0.242 0.7 0.182 ...
$ TravelDistance_Mi_New_Label: Factor w/ 65 levels "0.000568181818181818",..: 64 64 64 64 64 64 64 63 63 64 ...
Investigation of TravelDistance_Mi & TravelDistance_Mi_Hvrs & TravelDistance_Mi_New.
Quick summary and then correlation calculation.
# Five-number summary (plus mean and NA count) of the three distance columns,
# over all rows.
AllDays_NewTravelDist %>%
  select(TravelDistance_Mi,
         TravelDistance_Mi_Hvrs,
         TravelDistance_Mi_New) %>%
  summary()
TravelDistance_Mi TravelDistance_Mi_Hvrs TravelDistance_Mi_New
Min. : 0.0 Min. : 0.000 Min. : 0.000
1st Qu.: 0.1 1st Qu.: 0.106 1st Qu.: 0.141
Median : 0.2 Median : 0.142 Median : 0.199
Mean : 0.3 Mean : 0.201 Mean : 0.298
3rd Qu.: 0.3 3rd Qu.: 0.193 3rd Qu.: 0.276
Max. :250.7 Max. :24.407 Max. :250.656
NA's :322734 NA's :6528 NA's :6566
# Same summary, restricted to rows where BusDay_EventNum is not 1.
AllDays_NewTravelDist %>%
  filter(BusDay_EventNum != 1) %>%
  select(TravelDistance_Mi,
         TravelDistance_Mi_Hvrs,
         TravelDistance_Mi_New) %>%
  summary()
TravelDistance_Mi TravelDistance_Mi_Hvrs TravelDistance_Mi_New
Min. : 0.00 Min. : 0.0000 Min. : 0.00019
1st Qu.: 0.13 1st Qu.: 0.1055 1st Qu.: 0.14072
Median : 0.20 Median : 0.1424 Median : 0.19867
Mean : 0.31 Mean : 0.2008 Mean : 0.29751
3rd Qu.: 0.29 3rd Qu.: 0.1935 3rd Qu.: 0.27633
Max. :250.66 Max. :24.4068 Max. :250.65606
NA's :316206 NA's :38
# Pairwise correlations between the three distance columns; each pair uses
# only the rows where both values are present.
AllDays_NewTravelDist %>%
  select(TravelDistance_Mi,
         TravelDistance_Mi_Hvrs,
         TravelDistance_Mi_New) %>%
  cor(use = "pairwise.complete.obs")
TravelDistance_Mi TravelDistance_Mi_Hvrs
TravelDistance_Mi 1.0000000 0.5447660
TravelDistance_Mi_Hvrs 0.5447660 1.0000000
TravelDistance_Mi_New 0.9513379 0.5837182
TravelDistance_Mi_New
TravelDistance_Mi 0.9513379
TravelDistance_Mi_Hvrs 0.5837182
TravelDistance_Mi_New 1.0000000
Investigation of TravelDistance_Mi & TravelDistance_Mi_Hvrs & TravelDistance_Mi_New.
Graphing the two methods of calculating TravelDistance_Mi.
First, let’s create a function that builds the linear model equation label for the plot.
# Build a plotmath label "y = a +/- b . x, r^2 = ..." for the simple linear
# regression of y on x.
#
# Args:
#   df: data frame passed to lm() as its `data` argument. lm() looks up the
#       formula variables `y` and `x` in `df` first and only then in this
#       function's environment, so columns of `df` literally named "x" or "y"
#       would take precedence over the vectors passed below.
#   y:  numeric response vector.
#   x:  numeric predictor vector.
#
# Returns:
#   A length-one character string holding the deparsed plotmath expression,
#   suitable for a text layer drawn with parse = TRUE.
lm_eqn <- function(df, y, x){
  m <- lm(y ~ x, df)
  # coef() returns a NAMED vector and format() keeps the names; a named vector
  # substituted into the expression deparses as c(name = "value") instead of a
  # plain value, so the names are dropped here.
  intercept <- unname(coef(m)[1])
  slope <- unname(coef(m)[2])
  l <- list(a = format(intercept, digits = 2),
            # The sign is carried separately in s1, so only the magnitude
            # of the slope is formatted.
            b = format(abs(slope), digits = 2),
            s1 = if (isTRUE(slope > 0)) "+" else "-",
            r2 = format(summary(m)$r.squared,
                        digits = 3
                        )
            )
  eq <- substitute(italic(y) == a~~s1~~b %.% italic(x)*","~~italic(r)^2~"="~r2,
                   l
                   )
  as.character(as.expression(eq)
               )
}
Investigation of TravelDistance_Mi & TravelDistance_Mi_Hvrs & TravelDistance_Mi_New.
Scatter plot (using a 10% sample to make plotting faster and to reduce unneeded data in the “same” spot).
# Fix the RNG seed so the random sample below is reproducible.
set.seed(123456789)
# Keep rows with a non-NA TravelDistance_Mi, expose the label column under the
# shorter name DistMethod, then draw a 10% random sample of those rows.
AllDays_NewTravelDist_10Pct <- AllDays_NewTravelDist %>%
  filter(!is.na(TravelDistance_Mi)) %>%
  rename(DistMethod = TravelDistance_Mi_New_Label) %>%
  sample_frac(size = 0.1)
# Scatter plot of TravelDistance_Mi (y) against TravelDistance_Mi_New (x) for
# the 10% sample, coloured by DistMethod, with a blue lm smooth and a red
# slope-1 reference line. coord_cartesian() zooms the view to 0-1.5 on both
# axes.
TravDist_MiVsCalc <- AllDays_NewTravelDist_10Pct %>%
  select(TravelDistance_Mi_New, TravelDistance_Mi, DistMethod) %>%
  ggplot(aes(x = TravelDistance_Mi_New,
             y = TravelDistance_Mi,
             colour = DistMethod)) +
  # Layers, in drawing order.
  geom_point(shape = 1, alpha = 0.5) +
  geom_smooth(method = "lm", colour = "blue") +
  geom_abline(intercept = 0, slope = 1, colour = "red") +
  # Regression equation label, computed on the same 10% sample.
  annotate(geom = "text",
           x = 1.15,
           y = 1.45,
           size = 3,
           colour = "blue",
           parse = TRUE,
           label = lm_eqn(
             df = AllDays_NewTravelDist_10Pct,
             y = AllDays_NewTravelDist_10Pct[["TravelDistance_Mi"]],
             x = AllDays_NewTravelDist_10Pct[["TravelDistance_Mi_New"]]
           )) +
  annotate(geom = "text",
           x = 1.32,
           y = 1.05,
           size = 3,
           colour = "red",
           label = "Reference Line (slope = 1)") +
  # Scales, coordinate window, theme and labels.
  scale_colour_manual(values = c("red", "blue", "green", "orange")) +
  scale_shape(solid = FALSE) +
  scale_x_continuous(breaks = seq(0, 1.5, 0.25)) +
  scale_y_continuous(breaks = seq(0, 1.5, 0.25)) +
  coord_cartesian(xlim = c(0, 1.5), ylim = c(0, 1.5)) +
  theme(legend.position = c(0.85, 0.40),
        legend.text = element_text(size = 8)) +
  labs(title = "TravelDistance_Mi vs. TravelDistance_Mi_New",
       x = "TravelDistance_Mi_New",
       y = "TravelDistance_Mi")
# A geom_jitter() layer is intentionally left disabled.
TravDist_MiVsCalc
Investigation of TravelDistance_Mi & TravelDistance_Mi_Hvrs & TravelDistance_Mi_New.
Graphing test with rbokeh.
# Interactive rbokeh version of the same scatter: points coloured by
# DistMethod with hover details, plus a red line with intercept 0 and slope 1.
TravDist_MiVsCalc_Bokeh <-
  figure(data = AllDays_NewTravelDist_10Pct %>%
           select(TravelDistance_Mi_New, TravelDistance_Mi, DistMethod),
         legend_location = "bottom_right",
         xlim = c(0, 1.5),
         ylim = c(0, 1.5)) %>%
  ly_points(x = TravelDistance_Mi_New,
            y = TravelDistance_Mi,
            color = DistMethod,
            hover = c(TravelDistance_Mi_New, TravelDistance_Mi, DistMethod)) %>%
  ly_abline(a = 0, b = 1, color = "red")
TravDist_MiVsCalc_Bokeh